/* * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "shared/source/command_container/command_encoder.h" #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/timestamp_packet.h" #include "shared/source/os_interface/os_context.h" #include "shared/source/utilities/tag_allocator.h" #include "opencl/source/command_queue/command_queue.h" #include "opencl/source/command_queue/command_queue_hw.h" #include "opencl/source/command_queue/gpgpu_walker.h" #include "opencl/source/event/user_event.h" #include "opencl/source/helpers/queue_helpers.h" namespace NEO { // Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask template void GpgpuWalkerHelper::addAluReadModifyWriteRegister( LinearStream *pCommandStream, uint32_t aluRegister, AluRegisters operation, uint32_t mask) { // Load "Register" value into CS_GPR_R0 using MI_LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG; using MI_MATH = typename GfxFamily::MI_MATH; using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE; auto pCmd = pCommandStream->getSpaceForCmd(); MI_LOAD_REGISTER_REG cmdReg = GfxFamily::cmdInitLoadRegisterReg; cmdReg.setSourceRegisterAddress(aluRegister); cmdReg.setDestinationRegisterAddress(CS_GPR_R0); *pCmd = cmdReg; // Load "Mask" into CS_GPR_R1 LriHelper::program(pCommandStream, CS_GPR_R1, mask, false); // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands auto pCmd3 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE))); MI_MATH mathCmd; mathCmd.DW0.Value = 0x0; mathCmd.DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; mathCmd.DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; // 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE mathCmd.DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; *reinterpret_cast(pCmd3) = mathCmd; pCmd3++; MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(pCmd3); MI_MATH_ALU_INST_INLINE cmdAluParam; cmdAluParam.DW0.Value = 0x0; // Setup first operand of MI_MATH - load CS_GPR_R0 into register A cmdAluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); cmdAluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); cmdAluParam.DW0.BitField.Operand2 = static_cast(AluRegisters::R_0); *pAluParam = cmdAluParam; pAluParam++; cmdAluParam.DW0.Value = 0x0; // Setup second operand of MI_MATH - load CS_GPR_R1 into register B cmdAluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); cmdAluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); cmdAluParam.DW0.BitField.Operand2 = static_cast(AluRegisters::R_1); *pAluParam = cmdAluParam; pAluParam++; cmdAluParam.DW0.Value = 0x0; // Setup third operand of MI_MATH - "Operation" on registers A and B cmdAluParam.DW0.BitField.ALUOpcode = static_cast(operation); cmdAluParam.DW0.BitField.Operand1 = 0; cmdAluParam.DW0.BitField.Operand2 = 0; *pAluParam = cmdAluParam; pAluParam++; cmdAluParam.DW0.Value = 0x0; // Setup fourth operand of MI_MATH - store result into CS_GPR_R0 cmdAluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); cmdAluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_0); cmdAluParam.DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); *pAluParam = cmdAluParam; // LOAD value of CS_GPR_R0 into "Register" auto pCmd4 = pCommandStream->getSpaceForCmd(); cmdReg = GfxFamily::cmdInitLoadRegisterReg; cmdReg.setSourceRegisterAddress(CS_GPR_R0); cmdReg.setDestinationRegisterAddress(aluRegister); *pCmd4 = cmdReg; // Add PIPE_CONTROL to flush caches auto pCmd5 = pCommandStream->getSpaceForCmd(); PIPE_CONTROL cmdPipeControl = GfxFamily::cmdInitPipeControl; cmdPipeControl.setCommandStreamerStallEnable(true); cmdPipeControl.setDcFlushEnable(true); cmdPipeControl.setTextureCacheInvalidationEnable(true); cmdPipeControl.setPipeControlFlushEnable(true); cmdPipeControl.setStateCacheInvalidationEnable(true); *pCmd5 = cmdPipeControl; } template void GpgpuWalkerHelper::dispatchPerfCountersCommandsStart( CommandQueue &commandQueue, TagNodeBase &hwPerfCounter, LinearStream *commandStream) { const auto pPerformanceCounters = commandQueue.getPerfCounters(); const auto commandBufferType = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType()) ? MetricsLibraryApi::GpuCommandBufferType::Compute : MetricsLibraryApi::GpuCommandBufferType::Render; const uint32_t size = pPerformanceCounters->getGpuCommandsSize(commandBufferType, true); void *pBuffer = commandStream->getSpace(size); pPerformanceCounters->getGpuCommands(commandBufferType, hwPerfCounter, true, size, pBuffer); } template void GpgpuWalkerHelper::dispatchPerfCountersCommandsEnd( CommandQueue &commandQueue, TagNodeBase &hwPerfCounter, LinearStream *commandStream) { const auto pPerformanceCounters = commandQueue.getPerfCounters(); const auto commandBufferType = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType()) ? MetricsLibraryApi::GpuCommandBufferType::Compute : MetricsLibraryApi::GpuCommandBufferType::Render; const uint32_t size = pPerformanceCounters->getGpuCommandsSize(commandBufferType, false); void *pBuffer = commandStream->getSpace(size); pPerformanceCounters->getGpuCommands(commandBufferType, hwPerfCounter, false, size, pBuffer); } template void GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { } template size_t GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) { return (size_t)0; } template size_t GpgpuWalkerHelper::getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel) { return 0u; } template size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist, bool resolveDependenciesByPipecontrol, cl_event *outEvent) { size_t expectedSizeCS = 0; auto &gfxCoreHelper = commandQueue.getDevice().getGfxCoreHelper(); auto &commandQueueHw = static_cast &>(commandQueue); auto &rootDeviceEnvironment = commandQueue.getDevice().getRootDeviceEnvironment(); if (blitEnqueue) { size_t expectedSizeCS = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); if (commandQueueHw.isCacheFlushForBcsRequired()) { expectedSizeCS += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false); } return expectedSizeCS; } for (auto &dispatchInfo : multiDispatchInfo) { expectedSizeCS += EnqueueOperation::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel(), dispatchInfo); size_t kernelObjAuxCount = multiDispatchInfo.getKernelObjsForAuxTranslation() != nullptr ? multiDispatchInfo.getKernelObjsForAuxTranslation()->size() : 0; expectedSizeCS += dispatchInfo.dispatchInitCommands.estimateCommandsSize(kernelObjAuxCount, rootDeviceEnvironment, commandQueueHw.isCacheFlushForBcsRequired()); expectedSizeCS += dispatchInfo.dispatchEpilogueCommands.estimateCommandsSize(kernelObjAuxCount, rootDeviceEnvironment, commandQueueHw.isCacheFlushForBcsRequired()); } auto relaxedOrderingEnabled = commandQueue.getGpgpuCommandStreamReceiver().directSubmissionRelaxedOrderingEnabled(); if (relaxedOrderingEnabled) { expectedSizeCS += 2 * EncodeSetMMIO::sizeREG; } if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { // add relaxed ordering cond_bb_start expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSize(csrDeps, relaxedOrderingEnabled); expectedSizeCS += EnqueueOperation::getSizeRequiredForTimestampPacketWrite(); if (resolveDependenciesByPipecontrol) { expectedSizeCS += MemorySynchronizationCommands::getSizeForSingleBarrier(false); } if (isMarkerWithProfiling) { if (!eventsInWaitlist) { expectedSizeCS += commandQueue.getGpgpuCommandStreamReceiver().getCmdsSizeForComputeBarrierCommand(); } expectedSizeCS += 4 * EncodeStoreMMIO::size; } } else if (isMarkerWithProfiling) { expectedSizeCS += 2 * MemorySynchronizationCommands::getSizeForSingleBarrier(false); if (!gfxCoreHelper.useOnlyGlobalTimestamps()) { expectedSizeCS += 2 * EncodeStoreMMIO::size; } } if (multiDispatchInfo.peekMainKernel()) { expectedSizeCS += EnqueueOperation::getSizeForCacheFlushAfterWalkerCommands(*multiDispatchInfo.peekMainKernel(), commandQueue); } if (DebugManager.flags.PauseOnEnqueue.get() != -1) { expectedSizeCS += MemorySynchronizationCommands::getSizeForSingleBarrier(false) * 2; expectedSizeCS += NEO::EncodeSemaphore::getSizeMiSemaphoreWait() * 2; } if (DebugManager.flags.GpuScratchRegWriteAfterWalker.get() != -1) { expectedSizeCS += sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM); } expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps); if (outEvent) { auto pEvent = castToObjectOrAbort(*outEvent); if ((pEvent->getContext()->getRootDeviceIndices().size() > 1) && (!pEvent->isUserEvent())) { expectedSizeCS += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false); } } expectedSizeCS += MemorySynchronizationCommands::getSizeForSingleBarrier(false); if ((CL_COMMAND_BARRIER == eventType) && !commandQueue.isOOQEnabled() && eventsInWaitlist) { expectedSizeCS += EncodeStoreMemory::getStoreDataImmSize(); } return expectedSizeCS; } template size_t EnqueueOperation::getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo) { if (isCommandWithoutKernel(cmdType)) { return EnqueueOperation::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue); } else { return EnqueueOperation::getSizeRequiredCSKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel, dispatchInfo); } } template size_t EnqueueOperation::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) { size_t size = 0; if (reserveProfilingCmdsSpace) { size += 2 * MemorySynchronizationCommands::getSizeForSingleBarrier(false) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); } return size; } } // namespace NEO