/* * Copyright (C) 2017-2020 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/debug_helpers.h" #include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/hw_helper.h" #include "shared/source/indirect_heap/indirect_heap.h" #include "shared/source/memory_manager/graphics_allocation.h" #include "shared/source/os_interface/os_context.h" #include "shared/source/utilities/tag_allocator.h" #include "opencl/source/command_queue/command_queue.h" #include "opencl/source/command_queue/command_queue_hw.h" #include "opencl/source/command_queue/gpgpu_walker.h" #include "opencl/source/command_queue/local_id_gen.h" #include "opencl/source/event/perf_counter.h" #include "opencl/source/event/user_event.h" #include "opencl/source/helpers/hardware_commands_helper.h" #include "opencl/source/helpers/queue_helpers.h" #include "opencl/source/helpers/validators.h" #include "opencl/source/mem_obj/mem_obj.h" #include "pipe_control_args.h" #include #include namespace NEO { // Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask template void GpgpuWalkerHelper::addAluReadModifyWriteRegister( LinearStream *pCommandStream, uint32_t aluRegister, AluRegisters operation, uint32_t mask) { // Load "Register" value into CS_GPR_R0 using MI_LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG; using MI_MATH = typename GfxFamily::MI_MATH; using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE; auto pCmd = pCommandStream->getSpaceForCmd(); MI_LOAD_REGISTER_REG cmdReg = GfxFamily::cmdInitLoadRegisterReg; cmdReg.setSourceRegisterAddress(aluRegister); cmdReg.setDestinationRegisterAddress(CS_GPR_R0); *pCmd = cmdReg; // Load "Mask" into CS_GPR_R1 typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; auto pCmd2 = pCommandStream->getSpaceForCmd(); MI_LOAD_REGISTER_IMM cmdImm = GfxFamily::cmdInitLoadRegisterImm; cmdImm.setRegisterOffset(CS_GPR_R1); cmdImm.setDataDword(mask); *pCmd2 = cmdImm; // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands auto pCmd3 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE))); MI_MATH mathCmd; mathCmd.DW0.Value = 0x0; mathCmd.DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; mathCmd.DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; // 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE mathCmd.DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; *reinterpret_cast(pCmd3) = mathCmd; pCmd3++; MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(pCmd3); MI_MATH_ALU_INST_INLINE cmdAluParam; cmdAluParam.DW0.Value = 0x0; // Setup first operand of MI_MATH - load CS_GPR_R0 into register A cmdAluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); cmdAluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); cmdAluParam.DW0.BitField.Operand2 = static_cast(AluRegisters::R_0); *pAluParam = cmdAluParam; pAluParam++; cmdAluParam.DW0.Value = 0x0; // Setup second operand of MI_MATH - load CS_GPR_R1 into register B cmdAluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); cmdAluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); cmdAluParam.DW0.BitField.Operand2 = static_cast(AluRegisters::R_1); *pAluParam = cmdAluParam; pAluParam++; cmdAluParam.DW0.Value = 0x0; // Setup third operand of MI_MATH - "Operation" on registers A and B cmdAluParam.DW0.BitField.ALUOpcode = static_cast(operation); cmdAluParam.DW0.BitField.Operand1 = 0; cmdAluParam.DW0.BitField.Operand2 = 0; *pAluParam = cmdAluParam; pAluParam++; cmdAluParam.DW0.Value = 0x0; // Setup fourth operand of MI_MATH - store result into CS_GPR_R0 cmdAluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); cmdAluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_0); cmdAluParam.DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); *pAluParam = cmdAluParam; // LOAD value of CS_GPR_R0 into "Register" auto pCmd4 = pCommandStream->getSpaceForCmd(); cmdReg = GfxFamily::cmdInitLoadRegisterReg; cmdReg.setSourceRegisterAddress(CS_GPR_R0); cmdReg.setDestinationRegisterAddress(aluRegister); *pCmd4 = cmdReg; // Add PIPE_CONTROL to flush caches auto pCmd5 = pCommandStream->getSpaceForCmd(); PIPE_CONTROL cmdPipeControl = GfxFamily::cmdInitPipeControl; cmdPipeControl.setCommandStreamerStallEnable(true); cmdPipeControl.setDcFlushEnable(true); cmdPipeControl.setTextureCacheInvalidationEnable(true); cmdPipeControl.setPipeControlFlushEnable(true); cmdPipeControl.setStateCacheInvalidationEnable(true); *pCmd5 = cmdPipeControl; } template void GpgpuWalkerHelper::dispatchPerfCountersCommandsStart( CommandQueue &commandQueue, TagNode &hwPerfCounter, LinearStream *commandStream) { const auto pPerformanceCounters = commandQueue.getPerfCounters(); const auto commandBufferType = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType()) ? MetricsLibraryApi::GpuCommandBufferType::Compute : MetricsLibraryApi::GpuCommandBufferType::Render; const uint32_t size = pPerformanceCounters->getGpuCommandsSize(commandBufferType, true); void *pBuffer = commandStream->getSpace(size); pPerformanceCounters->getGpuCommands(commandBufferType, hwPerfCounter, true, size, pBuffer); } template void GpgpuWalkerHelper::dispatchPerfCountersCommandsEnd( CommandQueue &commandQueue, TagNode &hwPerfCounter, LinearStream *commandStream) { const auto pPerformanceCounters = commandQueue.getPerfCounters(); const auto commandBufferType = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType()) ? MetricsLibraryApi::GpuCommandBufferType::Compute : MetricsLibraryApi::GpuCommandBufferType::Render; const uint32_t size = pPerformanceCounters->getGpuCommandsSize(commandBufferType, false); void *pBuffer = commandStream->getSpace(size); pPerformanceCounters->getGpuCommands(commandBufferType, hwPerfCounter, false, size, pBuffer); } template void GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { } template size_t GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) { return (size_t)0; } template size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) { size_t expectedSizeCS = 0; auto &hwInfo = commandQueue.getDevice().getHardwareInfo(); auto &commandQueueHw = static_cast &>(commandQueue); if (blitEnqueue) { size_t expectedSizeCS = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); if (commandQueueHw.isCacheFlushForBcsRequired()) { expectedSizeCS += MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo); } return expectedSizeCS; } Kernel *parentKernel = multiDispatchInfo.peekParentKernel(); for (auto &dispatchInfo : multiDispatchInfo) { expectedSizeCS += EnqueueOperation::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel()); size_t memObjAuxCount = multiDispatchInfo.getMemObjsForAuxTranslation() != nullptr ? multiDispatchInfo.getMemObjsForAuxTranslation()->size() : 0; expectedSizeCS += dispatchInfo.dispatchInitCommands.estimateCommandsSize(memObjAuxCount, hwInfo, commandQueueHw.isCacheFlushForBcsRequired()); expectedSizeCS += dispatchInfo.dispatchEpilogueCommands.estimateCommandsSize(memObjAuxCount, hwInfo, commandQueueHw.isCacheFlushForBcsRequired()); } if (parentKernel) { SchedulerKernel &scheduler = commandQueue.getContext().getSchedulerKernel(); expectedSizeCS += EnqueueOperation::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, &scheduler); } if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSize(csrDeps); expectedSizeCS += EnqueueOperation::getSizeRequiredForTimestampPacketWrite(); } if (multiDispatchInfo.peekMainKernel()) { expectedSizeCS += EnqueueOperation::getSizeForCacheFlushAfterWalkerCommands(*multiDispatchInfo.peekMainKernel(), commandQueue); } if (DebugManager.flags.PauseOnEnqueue.get() != -1) { expectedSizeCS += MemorySynchronizationCommands::getSizeForSinglePipeControl() * 2; expectedSizeCS += sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT) * 2; } return expectedSizeCS; } template size_t EnqueueOperation::getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) { if (isCommandWithoutKernel(cmdType)) { return EnqueueOperation::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue); } else { return EnqueueOperation::getSizeRequiredCSKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel); } } template size_t EnqueueOperation::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) { size_t size = 0; if (reserveProfilingCmdsSpace) { size += 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); } return size; } } // namespace NEO