/*
 * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/direct_submission/relaxed_ordering_helper.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/pause_on_gpu_properties.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"

#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/command_queue/hardware_interface.h"
#include "opencl/source/helpers/cl_preemption_helper.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/source/helpers/task_information.h"
#include "opencl/source/mem_obj/buffer.h"

namespace NEO {

template <typename GfxFamily>
template <typename WalkerType>
inline WalkerType *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream,
                                                                     const Kernel &kernel) {
    auto walkerCmd = commandStream.getSpaceForCmd<WalkerType>();
    return walkerCmd;
}

template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchWalkerCommon(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs) {
    dispatchWalker<typename GfxFamily::DefaultWalkerType>(commandQueue, multiDispatchInfo, csrDependencies, walkerArgs);
}

template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
    TagNodeBase *hwTimeStamps,
    TagNodeBase *hwPerfCounter,
    LinearStream *commandStream,
    CommandQueue &commandQueue) {

    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
    if (hwTimeStamps != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream, commandQueue.getDevice().getRootDeviceEnvironment());
    }
    if (hwPerfCounter != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
    }
}

template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
    TagNodeBase *hwTimeStamps,
    TagNodeBase *hwPerfCounter,
    LinearStream *commandStream,
    CommandQueue &commandQueue) {

    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
    if (hwTimeStamps != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream, commandQueue.getDevice().getRootDeviceEnvironment());
    }
    if (hwPerfCounter != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
    }
}
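// dispatchWalker drives a whole NDRange enqueue: it selects the command stream and
// indirect heaps (blocked queues get freshly allocated heaps), programs cross-CSR
// timestamp packet dependencies, reserves interface descriptor table space, then
// emits one GPU walker per DispatchInfo, bracketed by optional profiling, perf
// counter, and debug pause commands.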
template <typename GfxFamily>
template <typename WalkerType>
void HardwareInterface<GfxFamily>::dispatchWalker(
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    const CsrDependencies &csrDependencies,
    HardwareInterfaceWalkerArgs &walkerArgs) {

    LinearStream *commandStream = nullptr;
    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
    auto mainKernel = multiDispatchInfo.peekMainKernel();

    walkerArgs.preemptionMode = ClPreemptionHelper::taskPreemptionMode(commandQueue.getDevice(), multiDispatchInfo);

    for (auto &dispatchInfo : multiDispatchInfo) {
        // Compute local workgroup sizes
        if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
            const auto lws = generateWorkgroupSize(dispatchInfo);
            const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
        }
    }

    // Allocate command stream and indirect heaps
    bool blockedQueue = (walkerArgs.blockedCommandsData != nullptr);
    obtainIndirectHeaps(commandQueue, multiDispatchInfo, blockedQueue, dsh, ioh, ssh);
    if (blockedQueue) {
        walkerArgs.blockedCommandsData->setHeaps(dsh, ioh, ssh);
        commandStream = walkerArgs.blockedCommandsData->commandStream.get();
    } else {
        commandStream = &commandQueue.getCS(0);
    }

    if (commandQueue.getDevice().getDebugger()) {
        auto debugSurface = commandQueue.getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
        void *addressToPatch = reinterpret_cast<void *>(debugSurface->getGpuAddress());
        size_t sizeToPatch = debugSurface->getUnderlyingBufferSize();
        Buffer::setSurfaceState(&commandQueue.getDevice(), commandQueue.getDevice().getDebugger()->getDebugSurfaceReservedSurfaceState(*ssh),
                                false, false, sizeToPatch, addressToPatch, 0, debugSurface, 0, 0,
                                mainKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
                                mainKernel->areMultipleSubDevicesInContext());
    }

    if (walkerArgs.relaxedOrderingEnabled) {
        RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers<GfxFamily>(*commandStream);
    }

    TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDependencies, walkerArgs.relaxedOrderingEnabled);

    dsh->align(EncodeStates<GfxFamily>::alignInterfaceDescriptorData);
    walkerArgs.interfaceDescriptorIndex = 0;
    walkerArgs.offsetInterfaceDescriptorTable = dsh->getUsed();

    size_t totalInterfaceDescriptorTableSize = GfxFamily::template getInterfaceDescriptorSize<WalkerType>();

    getDefaultDshSpace(walkerArgs.offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, dsh, commandStream);

    // Program media interface descriptor load
    HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
        *commandStream,
        walkerArgs.offsetInterfaceDescriptorTable,
        totalInterfaceDescriptorTableSize);

    DEBUG_BREAK_IF(walkerArgs.offsetInterfaceDescriptorTable % 64 != 0);

    dispatchProfilingPerfStartCommands(walkerArgs.hwTimeStamps, walkerArgs.hwPerfCounter, commandStream, commandQueue);

    const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
    if (PauseOnGpuProperties::pauseModeAllowed(debugManager.flags.PauseOnEnqueue.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount(), PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
        dispatchDebugPauseCommands(commandStream, commandQueue, DebugPauseState::waitingForUserStartConfirmation, DebugPauseState::hasUserStartConfirmation, hwInfo);
    }

    mainKernel->performKernelTuning(commandQueue.getGpgpuCommandStreamReceiver(),
                                    multiDispatchInfo.begin()->getLocalWorkgroupSize(),
                                    multiDispatchInfo.begin()->getActualWorkgroupSize(),
                                    multiDispatchInfo.begin()->getOffset(),
                                    walkerArgs.currentTimestampPacketNodes);

    walkerArgs.currentDispatchIndex = 0;
    for (auto &dispatchInfo : multiDispatchInfo) {
        dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment());
        walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel);

        dispatchKernelCommands<WalkerType>(commandQueue, dispatchInfo, *commandStream, *dsh, *ioh, *ssh, walkerArgs);

        walkerArgs.currentDispatchIndex++;
        dispatchInfo.dispatchEpilogueCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment());
    }
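    // Optional debug hooks below, both driven by debug manager flags: an MMIO
    // scratch-register write after the walkers (GpuScratchRegWriteAfterWalker),
    // useful for spotting workload completion externally, and a GPU-side pause
    // after the workload (PauseOnEnqueue).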
    if (PauseOnGpuProperties::gpuScratchRegWriteAllowed(debugManager.flags.GpuScratchRegWriteAfterWalker.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount())) {
        uint32_t registerOffset = debugManager.flags.GpuScratchRegWriteRegisterOffset.get();
        uint32_t registerData = debugManager.flags.GpuScratchRegWriteRegisterData.get();
        LriHelper<GfxFamily>::program(commandStream, registerOffset, registerData, EncodeSetMMIO<GfxFamily>::isRemapApplicable(registerOffset));
    }

    if (PauseOnGpuProperties::pauseModeAllowed(debugManager.flags.PauseOnEnqueue.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount(), PauseOnGpuProperties::PauseMode::AfterWorkload)) {
        dispatchDebugPauseCommands(commandStream, commandQueue, DebugPauseState::waitingForUserEndConfirmation, DebugPauseState::hasUserEndConfirmation, hwInfo);
    }

    dispatchProfilingPerfEndCommands(walkerArgs.hwTimeStamps, walkerArgs.hwPerfCounter, commandStream, commandQueue);
}

template <typename GfxFamily>
template <typename WalkerType>
void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
                                                          IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs) {
    auto &kernel = *dispatchInfo.getKernel();

    // Sanity-check that the dispatch geometry is consistent with the work dimension
    DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
    DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
    DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
    DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
    DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));

    // If we don't have a required WGS, compute one opportunistically
    if (walkerArgs.commandType == CL_COMMAND_NDRANGE_KERNEL) {
        provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), dispatchInfo);
    }

    // Get dispatch geometry
    auto dim = dispatchInfo.getDim();
    const auto &gws = dispatchInfo.getGWS();
    const auto &offset = dispatchInfo.getOffset();
    walkerArgs.startOfWorkgroups = &dispatchInfo.getStartOfWorkgroups();

    // Compute local workgroup sizes
    const auto &lws = dispatchInfo.getLocalWorkgroupSize();
    const auto &elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;

    // Compute number of work groups
    const auto &totalNumberOfWorkgroups = dispatchInfo.getTotalNumberOfWorkgroups();
    walkerArgs.numberOfWorkgroups = &dispatchInfo.getNumberOfWorkgroups();
    UNRECOVERABLE_IF(totalNumberOfWorkgroups.x == 0);
    UNRECOVERABLE_IF(walkerArgs.numberOfWorkgroups->x == 0);

    walkerArgs.globalWorkSizes[0] = gws.x;
    walkerArgs.globalWorkSizes[1] = gws.y;
    walkerArgs.globalWorkSizes[2] = gws.z;

    // Patch our kernel constants
    kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
    kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));

    if (walkerArgs.isMainKernel || (!kernel.isLocalWorkSize2Patchable())) {
        kernel.setLocalWorkSizeValues(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
    }

    kernel.setLocalWorkSize2Values(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
    kernel.setEnqueuedLocalWorkSizeValues(static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));

    if (walkerArgs.isMainKernel) {
        kernel.setNumWorkGroupsValues(static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
    }

    kernel.setWorkDim(dim);

    // Send our indirect object data
    walkerArgs.localWorkSizes[0] = lws.x;
    walkerArgs.localWorkSizes[1] = lws.y;
    walkerArgs.localWorkSizes[2] = lws.z;

    dispatchWorkarounds(&commandStream, commandQueue, kernel, true);

    programWalker<WalkerType>(commandStream, kernel, commandQueue, dsh, ioh, ssh, dispatchInfo, walkerArgs);

    dispatchWorkarounds(&commandStream, commandQueue, kernel, false);
}

template <typename GfxFamily>
void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
                                                       bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh) {
    if (blockedQueue) {
        size_t dshSize = 0;
        size_t colorCalcSize = 0;
        size_t sshSize = HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo);

        dshSize = HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo);

        commandQueue.allocateHeapMemory(IndirectHeap::Type::DYNAMIC_STATE, dshSize, dsh);
        dsh->getSpace(colorCalcSize);

        commandQueue.allocateHeapMemory(IndirectHeap::Type::SURFACE_STATE, sshSize, ssh);

        commandQueue.allocateHeapMemory(IndirectHeap::Type::INDIRECT_OBJECT,
                                        HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
    } else {
        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::Type::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::Type::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::Type::SURFACE_STATE>(commandQueue, multiDispatchInfo);
    }
}
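// Debug pause handshake (sketch): the barrier below writes `confirmationTrigger`
// to the CSR's debug pause state allocation as an immediate-data post-sync, and
// the following MI_SEMAPHORE_WAIT stalls the command streamer until the host-side
// pause handler (assumed to poll that same allocation) writes `waitCondition` back.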
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchDebugPauseCommands(
    LinearStream *commandStream,
    CommandQueue &commandQueue,
    DebugPauseState confirmationTrigger,
    DebugPauseState waitCondition,
    const HardwareInfo &hwInfo) {

    if (!commandQueue.isSpecial()) {
        auto address = commandQueue.getGpgpuCommandStreamReceiver().getDebugPauseStateGPUAddress();
        {
            PipeControlArgs args;
            args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, commandQueue.getDevice().getRootDeviceEnvironment());
            MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
                *commandStream,
                PostSyncMode::ImmediateData,
                address,
                static_cast<uint64_t>(confirmationTrigger),
                commandQueue.getDevice().getRootDeviceEnvironment(),
                args);
        }

        {
            using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
            EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandStream,
                                                                  address,
                                                                  static_cast<uint32_t>(waitCondition),
                                                                  COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD,
                                                                  false, false, false);
        }
    }
}

} // namespace NEO