/* * Copyright (C) 2017-2019 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "runtime/command_queue/command_queue.h" #include "runtime/command_queue/gpgpu_walker.h" #include "runtime/command_queue/local_id_gen.h" #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/device/device_info.h" #include "runtime/event/perf_counter.h" #include "runtime/event/user_event.h" #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/debug_helpers.h" #include "runtime/helpers/hw_helper.h" #include "runtime/helpers/kernel_commands.h" #include "runtime/helpers/queue_helpers.h" #include "runtime/helpers/validators.h" #include "runtime/indirect_heap/indirect_heap.h" #include "runtime/mem_obj/mem_obj.h" #include "runtime/memory_manager/graphics_allocation.h" #include "runtime/utilities/tag_allocator.h" #include "instrumentation.h" #include #include namespace NEO { // Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask template void GpgpuWalkerHelper::addAluReadModifyWriteRegister( NEO::LinearStream *pCommandStream, uint32_t aluRegister, uint32_t operation, uint32_t mask) { // Load "Register" value into CS_GPR_R0 typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; typedef typename GfxFamily::MI_MATH MI_MATH; typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; auto pCmd = pCommandStream->getSpaceForCmd(); *pCmd = GfxFamily::cmdInitLoadRegisterReg; pCmd->setSourceRegisterAddress(aluRegister); pCmd->setDestinationRegisterAddress(CS_GPR_R0); // Load "Mask" into CS_GPR_R1 typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; auto pCmd2 = pCommandStream->getSpaceForCmd(); *pCmd2 = GfxFamily::cmdInitLoadRegisterImm; pCmd2->setRegisterOffset(CS_GPR_R1); pCmd2->setDataDword(mask); // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands auto pCmd3 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE))); reinterpret_cast(pCmd3)->DW0.Value = 0x0; reinterpret_cast(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; reinterpret_cast(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; // 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE reinterpret_cast(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; pCmd3++; MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(pCmd3); // Setup first operand of MI_MATH - load CS_GPR_R0 into register A pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD; pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA; pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0; pAluParam++; // Setup second operand of MI_MATH - load CS_GPR_R1 into register B pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD; pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB; pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1; pAluParam++; // Setup third operand of MI_MATH - "Operation" on registers A and B pAluParam->DW0.BitField.ALUOpcode = operation; pAluParam->DW0.BitField.Operand1 = 0; pAluParam->DW0.BitField.Operand2 = 0; pAluParam++; // Setup fourth operand of MI_MATH - store result into CS_GPR_R0 pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE; pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0; pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU; // LOAD value of CS_GPR_R0 into "Register" auto pCmd4 = pCommandStream->getSpaceForCmd(); *pCmd4 = GfxFamily::cmdInitLoadRegisterReg; pCmd4->setSourceRegisterAddress(CS_GPR_R0); pCmd4->setDestinationRegisterAddress(aluRegister); // Add PIPE_CONTROL to flush caches auto pCmd5 = pCommandStream->getSpaceForCmd(); *pCmd5 = GfxFamily::cmdInitPipeControl; pCmd5->setCommandStreamerStallEnable(true); pCmd5->setDcFlushEnable(true); pCmd5->setTextureCacheInvalidationEnable(true); pCmd5->setPipeControlFlushEnable(true); pCmd5->setStateCacheInvalidationEnable(true); } template void GpgpuWalkerHelper::dispatchProfilingCommandsStart( TagNode &hwTimeStamps, LinearStream *commandStream) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; // PIPE_CONTROL for global timestamp uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, GlobalStartTS); PipeControlHelper::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, timeStampAddress, 0llu, false); //MI_STORE_REGISTER_MEM for context local timestamp timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextStartTS); //low part auto pMICmdLow = commandStream->getSpaceForCmd(); *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem; adjustMiStoreRegMemMode(pMICmdLow); pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); pMICmdLow->setMemoryAddress(timeStampAddress); } template void GpgpuWalkerHelper::dispatchProfilingCommandsEnd( TagNode &hwTimeStamps, LinearStream *commandStream) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; // PIPE_CONTROL for global timestamp auto pPipeControlCmd = commandStream->getSpaceForCmd(); *pPipeControlCmd = GfxFamily::cmdInitPipeControl; pPipeControlCmd->setCommandStreamerStallEnable(true); //MI_STORE_REGISTER_MEM for context local timestamp uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextEndTS); //low part auto pMICmdLow = commandStream->getSpaceForCmd(); *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem; adjustMiStoreRegMemMode(pMICmdLow); pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); pMICmdLow->setMemoryAddress(timeStampAddress); } template void GpgpuWalkerHelper::dispatchStoreRegisterCommand( LinearStream *commandStream, uint64_t memoryAddress, uint32_t registerAddress) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; auto pCmd = commandStream->getSpaceForCmd(); *pCmd = GfxFamily::cmdInitStoreRegisterMem; pCmd->setRegisterAddress(registerAddress); pCmd->setMemoryAddress(memoryAddress); } template void GpgpuWalkerHelper::dispatchPerfCountersGeneralPurposeCounterCommands( LinearStream *commandStream, uint64_t baseAddress) { // Read General Purpose counters for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) { uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint); //Gp field is 2*uint64 wide so it can hold 4 uint32 uint64_t address = baseAddress + i * sizeof(cl_uint); dispatchStoreRegisterCommand(commandStream, address, regAddr); } } template void GpgpuWalkerHelper::dispatchPerfCountersUserCounterCommands( CommandQueue &commandQueue, LinearStream *commandStream, uint64_t baseAddress) { auto userRegs = &commandQueue.getPerfCountersConfigData()->ReadRegs; for (uint32_t i = 0; i < userRegs->RegsCount; i++) { uint32_t regAddr = userRegs->Reg[i].Offset; //offset between base (low) registers is cl_ulong wide uint64_t address = baseAddress + i * sizeof(cl_ulong); dispatchStoreRegisterCommand(commandStream, address, regAddr); if (userRegs->Reg[i].BitSize > 32) { dispatchStoreRegisterCommand(commandStream, address + sizeof(cl_uint), regAddr + sizeof(cl_uint)); } } } template void GpgpuWalkerHelper::dispatchPerfCountersOABufferStateCommands( TagNode &hwPerfCounter, LinearStream *commandStream) { dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaStatus), INSTR_GFX_OFFSETS::INSTR_OA_STATUS); dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaHead), INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR); dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaTail), INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR); } template void GpgpuWalkerHelper::dispatchPerfCountersCommandsStart( CommandQueue &commandQueue, TagNode &hwPerfCounter, LinearStream *commandStream) { using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT; auto perfCounters = commandQueue.getPerfCounters(); uint32_t currentReportId = perfCounters->getCurrentReportId(); uint64_t address = 0; //flush command streamer auto pPipeControlCmd = commandStream->getSpaceForCmd(); *pPipeControlCmd = GfxFamily::cmdInitPipeControl; pPipeControlCmd->setCommandStreamerStallEnable(true); //Store value of NOOPID register GpgpuWalkerHelper::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdBegin), INSTR_MMIO_NOOPID); //Read Core Frequency GpgpuWalkerHelper::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqBegin), INSTR_MMIO_RPSTAT1); GpgpuWalkerHelper::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Gp)); auto pReportPerfCount = commandStream->getSpaceForCmd(); *pReportPerfCount = GfxFamily::cmdInitReportPerfCount; pReportPerfCount->setReportId(currentReportId); address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Oa); pReportPerfCount->setMemoryAddress(address); address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalStartTS); PipeControlHelper::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false); GpgpuWalkerHelper::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User)); commandQueue.sendPerfCountersConfig(); } template void GpgpuWalkerHelper::dispatchPerfCountersCommandsEnd( CommandQueue &commandQueue, TagNode &hwPerfCounter, LinearStream *commandStream) { using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT; auto perfCounters = commandQueue.getPerfCounters(); uint32_t currentReportId = perfCounters->getCurrentReportId(); //flush command streamer auto pPipeControlCmd = commandStream->getSpaceForCmd(); *pPipeControlCmd = GfxFamily::cmdInitPipeControl; pPipeControlCmd->setCommandStreamerStallEnable(true); GpgpuWalkerHelper::dispatchPerfCountersOABufferStateCommands(hwPerfCounter, commandStream); //Timestamp: Global End uint64_t address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalEndTS); PipeControlHelper::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false); auto pReportPerfCount = commandStream->getSpaceForCmd(); *pReportPerfCount = GfxFamily::cmdInitReportPerfCount; pReportPerfCount->setReportId(currentReportId); address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Oa); pReportPerfCount->setMemoryAddress(address); GpgpuWalkerHelper::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Gp)); //Store value of NOOPID register GpgpuWalkerHelper::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdEnd), INSTR_MMIO_NOOPID); //Read Core Frequency GpgpuWalkerHelper::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqEnd), INSTR_MMIO_RPSTAT1); GpgpuWalkerHelper::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User)); perfCounters->setCpuTimestamp(); } template void GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { } template size_t GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) { return (size_t)0; } template void GpgpuWalkerHelper::adjustMiStoreRegMemMode(MI_STORE_REG_MEM *storeCmd) { } template size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) { size_t expectedSizeCS = 0; Kernel *parentKernel = multiDispatchInfo.peekParentKernel(); if (multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->isAuxTranslationRequired()) { expectedSizeCS += sizeof(PIPE_CONTROL); } for (auto &dispatchInfo : multiDispatchInfo) { expectedSizeCS += EnqueueOperation::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel()); if (dispatchInfo.isPipeControlRequired()) { expectedSizeCS += sizeof(PIPE_CONTROL); } } if (parentKernel) { SchedulerKernel &scheduler = commandQueue.getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(parentKernel->getContext()); expectedSizeCS += EnqueueOperation::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, &scheduler); } if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { expectedSizeCS += EnqueueOperation::getSizeRequiredForTimestampPacketWrite(); expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSize(csrDeps); } return expectedSizeCS; } template size_t EnqueueOperation::getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) { if (isCommandWithoutKernel(cmdType)) { return EnqueueOperation::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue); } else { return EnqueueOperation::getSizeRequiredCSKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel); } } template size_t EnqueueOperation::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) { size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper::getSizeRequiredCS(pKernel) + sizeof(PIPE_CONTROL) * (KernelCommandsHelper::isPipeControlWArequired() ? 2 : 1); size += KernelCommandsHelper::getSizeRequiredForCacheFlush(commandQueue, pKernel, 0U, 0U); size += PreemptionHelper::getPreemptionWaCsSize(commandQueue.getDevice()); if (reserveProfilingCmdsSpace) { size += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); } if (reservePerfCounters) { //start cmds //P_C: flush CS & TimeStamp BEGIN size += 2 * sizeof(PIPE_CONTROL); //SRM NOOPID & Frequency size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); //gp registers size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); //report perf count size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); //user registers size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); //end cmds //P_C: flush CS & TimeStamp END; size += 2 * sizeof(PIPE_CONTROL); //OA buffer (status head, tail) size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); //report perf count size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); //gp registers size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); //SRM NOOPID & Frequency size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); //user registers size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); } size += GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(pKernel); return size; } template size_t EnqueueOperation::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) { size_t size = 0; if (reserveProfilingCmdsSpace) { size += 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); } return size; } template size_t EnqueueOperation::getSizeRequiredForTimestampPacketWrite() { return sizeof(PIPE_CONTROL); } } // namespace NEO