From 603eee76e5b153fd5e7ca2c9ef37a7f4ce49d8e6 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Thu, 18 Apr 2019 12:25:29 +0200 Subject: [PATCH] Use GPU pointers for HwPerfCounter. Related-To: NEO-2872 Change-Id: Ia30f2ee0d96a3da05b8e5ecf55e9b7fb5a34ace7 Signed-off-by: Piotr Fusik --- runtime/command_queue/command_queue.h | 8 +- runtime/command_queue/enqueue_common.h | 7 +- runtime/command_queue/gpgpu_walker.h | 80 ++++---- runtime/command_queue/gpgpu_walker.inl | 182 ++++++------------ runtime/command_queue/hardware_interface.h | 6 +- runtime/command_queue/hardware_interface.inl | 2 +- .../command_queue/hardware_interface_base.inl | 4 +- runtime/device_queue/device_queue_hw.inl | 1 - unit_tests/event/event_tests.cpp | 9 +- .../libult/ult_command_stream_receiver.h | 2 + unit_tests/profiling/profiling_tests.cpp | 98 ++++++++++ 11 files changed, 209 insertions(+), 190 deletions(-) diff --git a/runtime/command_queue/command_queue.h b/runtime/command_queue/command_queue.h index 32f1e716c8..0c414d0c97 100644 --- a/runtime/command_queue/command_queue.h +++ b/runtime/command_queue/command_queue.h @@ -362,15 +362,15 @@ class CommandQueue : public BaseObject<_cl_command_queue> { return commandQueueProperties; } - bool isProfilingEnabled() { + bool isProfilingEnabled() const { return !!(this->getCommandQueueProperties() & CL_QUEUE_PROFILING_ENABLE); } - bool isOOQEnabled() { + bool isOOQEnabled() const { return !!(this->getCommandQueueProperties() & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); } - bool isPerfCountersEnabled() { + bool isPerfCountersEnabled() const { return perfCountersEnabled; } @@ -388,7 +388,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { this->isSpecialCommandQueue = newValue; } - uint16_t getPerfCountersUserRegistersNumber() { + uint16_t getPerfCountersUserRegistersNumber() const { return perfCountersUserRegistersNumber; } diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 20c3578013..fda2e3bd1a 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -163,8 +163,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } bool profilingRequired = (this->isProfilingEnabled() && event != nullptr); - bool perfCountersRequired = false; - perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr); + bool perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr); KernelOperation *blockedCommandsData = nullptr; std::unique_ptr printfHandler; bool slmUsed = multiDispatchInfo.usesSlm() || parentKernel; @@ -381,7 +380,7 @@ void CommandQueueHw::processDispatchForKernels(const MultiDispatchInf KernelOperation *&blockedCommandsData, TimestampPacketContainer &previousTimestampPacketNodes, PreemptionMode preemption) { - HwPerfCounter *hwPerfCounter = nullptr; + TagNode *hwPerfCounter = nullptr; DebugManager.dumpKernelArgs(&multiDispatchInfo); printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device)); @@ -399,7 +398,7 @@ void CommandQueueHw::processDispatchForKernels(const MultiDispatchInf // Get allocation for timestamps hwTimeStamps = event->getHwTimeStampNode(); if (this->isPerfCountersEnabled()) { - hwPerfCounter = event->getHwPerfCounterNode()->tagForCpuAccess; + hwPerfCounter = event->getHwPerfCounterNode(); // PERF COUNTER: copy current configuration from queue to event event->copyPerfCounters(this->getPerfCountersConfigData()); } diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h index c127154967..fe1f5fa867 100644 --- a/runtime/command_queue/gpgpu_walker.h +++ b/runtime/command_queue/gpgpu_walker.h @@ -114,15 +114,6 @@ inline cl_uint computeDimensions(const size_t workItems[3]) { template class GpgpuWalkerHelper { public: - using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA; - - static void addAluReadModifyWriteRegister( - LinearStream *pCommandStream, - uint32_t aluRegister, - uint32_t operation, - uint32_t mask); - static void applyWADisableLSQCROPERFforOCL(LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode); @@ -143,50 +134,21 @@ class GpgpuWalkerHelper { static void dispatchProfilingCommandsStart( TagNode &hwTimeStamps, - NEO::LinearStream *commandStream); + LinearStream *commandStream); static void dispatchProfilingCommandsEnd( TagNode &hwTimeStamps, - NEO::LinearStream *commandStream); - - static void dispatchPerfCountersNoopidRegisterCommands( - CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream, - bool start); - - static void dispatchPerfCountersReadFreqRegisterCommands( - CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream, - bool start); - - static void dispatchPerfCountersGeneralPurposeCounterCommands( - CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream, - bool start); - - static void dispatchPerfCountersUserCounterCommands( - CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream, - bool start); - - static void dispatchPerfCountersOABufferStateCommands( - CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream); + LinearStream *commandStream); static void dispatchPerfCountersCommandsStart( CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream); + TagNode &hwPerfCounter, + LinearStream *commandStream); static void dispatchPerfCountersCommandsEnd( CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream); + TagNode &hwPerfCounter, + LinearStream *commandStream); static void setupTimestampPacket( LinearStream *cmdStream, @@ -203,6 +165,36 @@ class GpgpuWalkerHelper { IndirectHeap *dsh); static void adjustMiStoreRegMemMode(MI_STORE_REG_MEM *storeCmd); + + private: + using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; + + static void addAluReadModifyWriteRegister( + LinearStream *pCommandStream, + uint32_t aluRegister, + uint32_t operation, + uint32_t mask); + + static void dispatchStoreRegisterCommand( + LinearStream *commandStream, + uint64_t memoryAddress, + uint32_t registerAddress); + + static void dispatchPerfCountersGeneralPurposeCounterCommands( + CommandQueue &commandQueue, + TagNode &hwPerfCounter, + LinearStream *commandStream, + bool start); + + static void dispatchPerfCountersUserCounterCommands( + CommandQueue &commandQueue, + TagNode &hwPerfCounter, + LinearStream *commandStream, + bool start); + + static void dispatchPerfCountersOABufferStateCommands( + TagNode &hwPerfCounter, + LinearStream *commandStream); }; template diff --git a/runtime/command_queue/gpgpu_walker.inl b/runtime/command_queue/gpgpu_walker.inl index 99a15c2cf5..03291916e0 100644 --- a/runtime/command_queue/gpgpu_walker.inl +++ b/runtime/command_queue/gpgpu_walker.inl @@ -42,14 +42,14 @@ void GpgpuWalkerHelper::addAluReadModifyWriteRegister( typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; typedef typename GfxFamily::MI_MATH MI_MATH; typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; - auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG))); + auto pCmd = pCommandStream->getSpaceForCmd(); *pCmd = GfxFamily::cmdInitLoadRegisterReg; pCmd->setSourceRegisterAddress(aluRegister); pCmd->setDestinationRegisterAddress(CS_GPR_R0); // Load "Mask" into CS_GPR_R1 typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; - auto pCmd2 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM))); + auto pCmd2 = pCommandStream->getSpaceForCmd(); *pCmd2 = GfxFamily::cmdInitLoadRegisterImm; pCmd2->setRegisterOffset(CS_GPR_R1); pCmd2->setDataDword(mask); @@ -88,13 +88,13 @@ void GpgpuWalkerHelper::addAluReadModifyWriteRegister( pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU; // LOAD value of CS_GPR_R0 into "Register" - auto pCmd4 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG))); + auto pCmd4 = pCommandStream->getSpaceForCmd(); *pCmd4 = GfxFamily::cmdInitLoadRegisterReg; pCmd4->setSourceRegisterAddress(CS_GPR_R0); pCmd4->setDestinationRegisterAddress(aluRegister); // Add PIPE_CONTROL to flush caches - auto pCmd5 = reinterpret_cast(pCommandStream->getSpace(sizeof(PIPE_CONTROL))); + auto pCmd5 = pCommandStream->getSpaceForCmd(); *pCmd5 = GfxFamily::cmdInitPipeControl; pCmd5->setCommandStreamerStallEnable(true); pCmd5->setDcFlushEnable(true); @@ -106,7 +106,8 @@ void GpgpuWalkerHelper::addAluReadModifyWriteRegister( template void GpgpuWalkerHelper::dispatchProfilingCommandsStart( TagNode &hwTimeStamps, - NEO::LinearStream *commandStream) { + LinearStream *commandStream) { + using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; // PIPE_CONTROL for global timestamp @@ -118,7 +119,7 @@ void GpgpuWalkerHelper::dispatchProfilingCommandsStart( timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextStartTS); //low part - auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); + auto pMICmdLow = commandStream->getSpaceForCmd(); *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem; adjustMiStoreRegMemMode(pMICmdLow); pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); @@ -128,12 +129,12 @@ void GpgpuWalkerHelper::dispatchProfilingCommandsStart( template void GpgpuWalkerHelper::dispatchProfilingCommandsEnd( TagNode &hwTimeStamps, - NEO::LinearStream *commandStream) { + LinearStream *commandStream) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; // PIPE_CONTROL for global timestamp - auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL)); + auto pPipeControlCmd = commandStream->getSpaceForCmd(); *pPipeControlCmd = GfxFamily::cmdInitPipeControl; pPipeControlCmd->setCommandStreamerStallEnable(true); @@ -141,7 +142,7 @@ void GpgpuWalkerHelper::dispatchProfilingCommandsEnd( uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextEndTS); //low part - auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); + auto pMICmdLow = commandStream->getSpaceForCmd(); *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem; adjustMiStoreRegMemMode(pMICmdLow); pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); @@ -149,144 +150,79 @@ void GpgpuWalkerHelper::dispatchProfilingCommandsEnd( } template -void GpgpuWalkerHelper::dispatchPerfCountersNoopidRegisterCommands( - CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream, - bool start) { +void GpgpuWalkerHelper::dispatchStoreRegisterCommand( + LinearStream *commandStream, + uint64_t memoryAddress, + uint32_t registerAddress) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; - uint64_t address = start ? reinterpret_cast(&(hwPerfCounter.HWPerfCounters.DMAFenceIdBegin)) - : reinterpret_cast(&(hwPerfCounter.HWPerfCounters.DMAFenceIdEnd)); - - auto pNoopIdRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); - *pNoopIdRegister = GfxFamily::cmdInitStoreRegisterMem; - pNoopIdRegister->setRegisterAddress(NEO::INSTR_MMIO_NOOPID); - pNoopIdRegister->setMemoryAddress(address); -} - -template -void GpgpuWalkerHelper::dispatchPerfCountersReadFreqRegisterCommands( - CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream, - bool start) { - - using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; - - uint64_t address = start ? reinterpret_cast(&(hwPerfCounter.HWPerfCounters.CoreFreqBegin)) - : reinterpret_cast(&(hwPerfCounter.HWPerfCounters.CoreFreqEnd)); - - auto pCoreFreqRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); - *pCoreFreqRegister = GfxFamily::cmdInitStoreRegisterMem; - pCoreFreqRegister->setRegisterAddress(NEO::INSTR_MMIO_RPSTAT1); - pCoreFreqRegister->setMemoryAddress(address); + auto pCmd = commandStream->getSpaceForCmd(); + *pCmd = GfxFamily::cmdInitStoreRegisterMem; + pCmd->setRegisterAddress(registerAddress); + pCmd->setMemoryAddress(memoryAddress); } template void GpgpuWalkerHelper::dispatchPerfCountersGeneralPurposeCounterCommands( CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream, + TagNode &hwPerfCounter, + LinearStream *commandStream, bool start) { - using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; - uint64_t address = 0; - const uint64_t baseAddress = start ? reinterpret_cast(&(hwPerfCounter.HWPerfCounters.HwPerfReportBegin.Gp)) - : reinterpret_cast(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Gp)); + uint64_t baseAddress = hwPerfCounter.getGpuAddress(); + baseAddress += start ? offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Gp) + : offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Gp); // Read General Purpose counters - for (uint16_t i = 0; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) { - auto pGeneralPurposeRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); - *pGeneralPurposeRegister = GfxFamily::cmdInitStoreRegisterMem; + for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) { uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint); - pGeneralPurposeRegister->setRegisterAddress(regAddr); //Gp field is 2*uint64 wide so it can hold 4 uint32 - address = baseAddress + i * sizeof(cl_uint); - pGeneralPurposeRegister->setMemoryAddress(address); + uint64_t address = baseAddress + i * sizeof(cl_uint); + dispatchStoreRegisterCommand(commandStream, address, regAddr); } } template void GpgpuWalkerHelper::dispatchPerfCountersUserCounterCommands( CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream, + TagNode &hwPerfCounter, + LinearStream *commandStream, bool start) { - using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; - - uint64_t address = 0; - const uint64_t baseAddr = start ? reinterpret_cast(&(hwPerfCounter.HWPerfCounters.HwPerfReportBegin.User)) - : reinterpret_cast(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.User)); - uint32_t cmdNum = 0; - uint32_t regAddr = 0; - auto configData = commandQueue.getPerfCountersConfigData(); - auto userRegs = &configData->ReadRegs; + uint64_t baseAddr = hwPerfCounter.getGpuAddress(); + baseAddr += start ? offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User) + : offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User); + auto userRegs = &commandQueue.getPerfCountersConfigData()->ReadRegs; for (uint32_t i = 0; i < userRegs->RegsCount; i++) { - auto pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); - *pRegister = GfxFamily::cmdInitStoreRegisterMem; - - regAddr = userRegs->Reg[i].Offset; - pRegister->setRegisterAddress(regAddr); + uint32_t regAddr = userRegs->Reg[i].Offset; //offset between base (low) registers is cl_ulong wide - address = baseAddr + i * sizeof(cl_ulong); - pRegister->setMemoryAddress(address); - cmdNum++; + uint64_t address = baseAddr + i * sizeof(cl_ulong); + dispatchStoreRegisterCommand(commandStream, address, regAddr); if (userRegs->Reg[i].BitSize > 32) { - pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); - *pRegister = GfxFamily::cmdInitStoreRegisterMem; - - regAddr += sizeof(cl_uint); - pRegister->setRegisterAddress(regAddr); - address += sizeof(cl_uint); - pRegister->setMemoryAddress(address); - cmdNum++; + dispatchStoreRegisterCommand(commandStream, address + sizeof(cl_uint), regAddr + sizeof(cl_uint)); } } } template void GpgpuWalkerHelper::dispatchPerfCountersOABufferStateCommands( - CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream) { + TagNode &hwPerfCounter, + LinearStream *commandStream) { - using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; - - uint64_t address = 0; - //OA Status - auto pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); - *pOaRegister = GfxFamily::cmdInitStoreRegisterMem; - pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_STATUS); - address = reinterpret_cast(&(hwPerfCounter.HWPerfCounters.OaStatus)); - pOaRegister->setMemoryAddress(address); - - //OA Head - pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); - *pOaRegister = GfxFamily::cmdInitStoreRegisterMem; - pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR); - address = reinterpret_cast(&(hwPerfCounter.HWPerfCounters.OaHead)); - pOaRegister->setMemoryAddress(address); - - //OA Tail - pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); - *pOaRegister = GfxFamily::cmdInitStoreRegisterMem; - pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR); - address = reinterpret_cast(&(hwPerfCounter.HWPerfCounters.OaTail)); - pOaRegister->setMemoryAddress(address); + dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaStatus), INSTR_GFX_OFFSETS::INSTR_OA_STATUS); + dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaHead), INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR); + dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaTail), INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR); } template void GpgpuWalkerHelper::dispatchPerfCountersCommandsStart( CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream) { + TagNode &hwPerfCounter, + LinearStream *commandStream) { - using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT; auto perfCounters = commandQueue.getPerfCounters(); @@ -294,25 +230,25 @@ void GpgpuWalkerHelper::dispatchPerfCountersCommandsStart( uint32_t currentReportId = perfCounters->getCurrentReportId(); uint64_t address = 0; //flush command streamer - auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL)); + auto pPipeControlCmd = commandStream->getSpaceForCmd(); *pPipeControlCmd = GfxFamily::cmdInitPipeControl; pPipeControlCmd->setCommandStreamerStallEnable(true); //Store value of NOOPID register - GpgpuWalkerHelper::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, true); + GpgpuWalkerHelper::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdBegin), INSTR_MMIO_NOOPID); //Read Core Frequency - GpgpuWalkerHelper::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, true); + GpgpuWalkerHelper::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqBegin), INSTR_MMIO_RPSTAT1); GpgpuWalkerHelper::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, true); - auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT)); + auto pReportPerfCount = commandStream->getSpaceForCmd(); *pReportPerfCount = GfxFamily::cmdInitReportPerfCount; pReportPerfCount->setReportId(currentReportId); - address = reinterpret_cast(&(hwPerfCounter.HWPerfCounters.HwPerfReportBegin.Oa)); + address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Oa); pReportPerfCount->setMemoryAddress(address); - address = reinterpret_cast(&(hwPerfCounter.HWTimeStamp.GlobalStartTS)); + address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalStartTS); PipeControlHelper::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false); @@ -324,41 +260,39 @@ void GpgpuWalkerHelper::dispatchPerfCountersCommandsStart( template void GpgpuWalkerHelper::dispatchPerfCountersCommandsEnd( CommandQueue &commandQueue, - NEO::HwPerfCounter &hwPerfCounter, - NEO::LinearStream *commandStream) { + TagNode &hwPerfCounter, + LinearStream *commandStream) { - using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT; auto perfCounters = commandQueue.getPerfCounters(); uint32_t currentReportId = perfCounters->getCurrentReportId(); - uint64_t address = 0; //flush command streamer - auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL)); + auto pPipeControlCmd = commandStream->getSpaceForCmd(); *pPipeControlCmd = GfxFamily::cmdInitPipeControl; pPipeControlCmd->setCommandStreamerStallEnable(true); - GpgpuWalkerHelper::dispatchPerfCountersOABufferStateCommands(commandQueue, hwPerfCounter, commandStream); + GpgpuWalkerHelper::dispatchPerfCountersOABufferStateCommands(hwPerfCounter, commandStream); //Timestamp: Global End - address = reinterpret_cast(&(hwPerfCounter.HWTimeStamp.GlobalEndTS)); + uint64_t address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalEndTS); PipeControlHelper::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false); - auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT)); + auto pReportPerfCount = commandStream->getSpaceForCmd(); *pReportPerfCount = GfxFamily::cmdInitReportPerfCount; pReportPerfCount->setReportId(currentReportId); - address = reinterpret_cast(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Oa)); + address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Oa); pReportPerfCount->setMemoryAddress(address); GpgpuWalkerHelper::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, false); //Store value of NOOPID register - GpgpuWalkerHelper::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, false); + GpgpuWalkerHelper::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdEnd), INSTR_MMIO_NOOPID); //Read Core Frequency - GpgpuWalkerHelper::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, false); + GpgpuWalkerHelper::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqEnd), INSTR_MMIO_RPSTAT1); GpgpuWalkerHelper::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, false); diff --git a/runtime/command_queue/hardware_interface.h b/runtime/command_queue/hardware_interface.h index cfb4e92115..ce7d8aa316 100644 --- a/runtime/command_queue/hardware_interface.h +++ b/runtime/command_queue/hardware_interface.h @@ -41,7 +41,7 @@ class HardwareInterface { const CsrDependencies &csrDependencies, KernelOperation **blockedCommandsData, TagNode *hwTimeStamps, - HwPerfCounter *hwPerfCounter, + TagNode *hwPerfCounter, TimestampPacketContainer *previousTimestampPacketNodes, TimestampPacketContainer *currentTimestampPacketNodes, PreemptionMode preemptionMode, @@ -67,13 +67,13 @@ class HardwareInterface { const DispatchInfo &dispatchInfo, const MultiDispatchInfo &multiDispatchInfo, TagNode *hwTimeStamps, - HwPerfCounter *hwPerfCounter, + TagNode *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue); static void dispatchProfilingPerfEndCommands( TagNode *hwTimeStamps, - HwPerfCounter *hwPerfCounter, + TagNode *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue); diff --git a/runtime/command_queue/hardware_interface.inl b/runtime/command_queue/hardware_interface.inl index 0c6ea95d73..c68df52aae 100644 --- a/runtime/command_queue/hardware_interface.inl +++ b/runtime/command_queue/hardware_interface.inl @@ -28,7 +28,7 @@ void HardwareInterface::dispatchWalker( const CsrDependencies &csrDependencies, KernelOperation **blockedCommandsData, TagNode *hwTimeStamps, - HwPerfCounter *hwPerfCounter, + TagNode *hwPerfCounter, TimestampPacketContainer *previousTimestampPacketNodes, TimestampPacketContainer *currentTimestampPacketNodes, PreemptionMode preemptionMode, diff --git a/runtime/command_queue/hardware_interface_base.inl b/runtime/command_queue/hardware_interface_base.inl index c92c7e5743..c80fe9e736 100644 --- a/runtime/command_queue/hardware_interface_base.inl +++ b/runtime/command_queue/hardware_interface_base.inl @@ -53,7 +53,7 @@ inline void HardwareInterface::dispatchProfilingPerfStartCommands( const DispatchInfo &dispatchInfo, const MultiDispatchInfo &multiDispatchInfo, TagNode *hwTimeStamps, - HwPerfCounter *hwPerfCounter, + TagNode *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue) { @@ -71,7 +71,7 @@ inline void HardwareInterface::dispatchProfilingPerfStartCommands( template inline void HardwareInterface::dispatchProfilingPerfEndCommands( TagNode *hwTimeStamps, - HwPerfCounter *hwPerfCounter, + TagNode *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue) { diff --git a/runtime/device_queue/device_queue_hw.inl b/runtime/device_queue/device_queue_hw.inl index e1c28c05b6..8f80d06cac 100644 --- a/runtime/device_queue/device_queue_hw.inl +++ b/runtime/device_queue/device_queue_hw.inl @@ -213,7 +213,6 @@ void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKer igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed()); GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true); - using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; if (hwTimeStamp != nullptr) { diff --git a/unit_tests/event/event_tests.cpp b/unit_tests/event/event_tests.cpp index fe959b2d4a..58dd391fe3 100644 --- a/unit_tests/event/event_tests.cpp +++ b/unit_tests/event/event_tests.cpp @@ -1194,13 +1194,8 @@ TEST_F(EventTest, hwPerfCounterMemoryIsPlacedInGraphicsAllocation) { void *memoryStorage = allocation->getUnderlyingBuffer(); size_t graphicsAllocationSize = allocation->getUnderlyingBufferSize(); - uintptr_t perfCounterAddress = reinterpret_cast(perfCounter); - uintptr_t graphicsAllocationStart = reinterpret_cast(memoryStorage); - - if (!((perfCounterAddress >= graphicsAllocationStart) && - ((perfCounterAddress + sizeof(HwPerfCounter)) <= (graphicsAllocationStart + graphicsAllocationSize)))) { - EXPECT_TRUE(false); - } + EXPECT_GE(perfCounter, memoryStorage); + EXPECT_LE(perfCounter + 1, ptrOffset(memoryStorage, graphicsAllocationSize)); } TEST_F(EventTest, IsPerfCounter_DisabledByNullQueue) { diff --git a/unit_tests/libult/ult_command_stream_receiver.h b/unit_tests/libult/ult_command_stream_receiver.h index 7b7a5de8f7..d7f0bfeb1b 100644 --- a/unit_tests/libult/ult_command_stream_receiver.h +++ b/unit_tests/libult/ult_command_stream_receiver.h @@ -51,6 +51,8 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::CommandStreamReceiver::latestSentStatelessMocsConfig; using BaseClass::CommandStreamReceiver::latestSentTaskCount; using BaseClass::CommandStreamReceiver::mediaVfeStateDirty; + using BaseClass::CommandStreamReceiver::perfCounterAllocator; + using BaseClass::CommandStreamReceiver::profilingTimeStampAllocator; using BaseClass::CommandStreamReceiver::requiredScratchSize; using BaseClass::CommandStreamReceiver::requiredThreadArbitrationPolicy; using BaseClass::CommandStreamReceiver::samplerCacheFlushRequired; diff --git a/unit_tests/profiling/profiling_tests.cpp b/unit_tests/profiling/profiling_tests.cpp index 4a61798ea1..bf472debf0 100644 --- a/unit_tests/profiling/profiling_tests.cpp +++ b/unit_tests/profiling/profiling_tests.cpp @@ -422,7 +422,9 @@ class MyOSTime : public OSTime { return 0; } }; + int MyOSTime::instanceNum = 0; + TEST(EventProfilingTest, givenEventWhenCompleteIsZeroThenCalcProfilingDataSetsEndTimestampInCompleteTimestampAndDoesntCallOsTimeMethods) { std::unique_ptr device(MockDevice::createWithNewExecutionEnvironment(nullptr)); MyOSTime::instanceNum = 0; @@ -523,7 +525,21 @@ struct ProfilingWithPerfCountersTests : public ProfilingTests, ProfilingTests::TearDown(); PerformanceCountersFixture::TearDown(); } + + template + GenCmdList::iterator expectStoreRegister(GenCmdList::iterator itor, uint64_t memoryAddress, uint32_t registerAddress) { + using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; + + itor = find(itor, cmdList.end()); + EXPECT_NE(cmdList.end(), itor); + auto pStore = genCmdCast(*itor); + EXPECT_EQ(memoryAddress, pStore->getMemoryAddress()); + EXPECT_EQ(registerAddress, pStore->getRegisterAddress()); + itor++; + return itor; + } }; + HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCounterAndForWorkloadWithKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) { typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM; typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; @@ -859,6 +875,88 @@ HWTEST_F(ProfilingWithPerfCountersTests, pCmdQ->setPerfCountersEnabled(false, UINT32_MAX); } +template +struct FixedGpuAddressTagAllocator : TagAllocator { + + struct MockTagNode : TagNode { + void setGpuAddress(uint64_t value) { this->gpuAddress = value; } + }; + + FixedGpuAddressTagAllocator(CommandStreamReceiver &csr, uint64_t gpuAddress) + : TagAllocator(csr.getMemoryManager(), csr.getPreferredTagPoolSize(), MemoryConstants::cacheLineSize) { + auto tag = reinterpret_cast(this->freeTags.peekHead()); + tag->setGpuAddress(gpuAddress); + } +}; + +HWTEST_F(ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersWHENWalkerIsDispatchedTHENRegisterStoresArePresentInCS) { + uint64_t timeStampGpuAddress = 0x123456000; + uint64_t perfCountersGpuAddress = 0xabcdef000; + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.profilingTimeStampAllocator.reset(new FixedGpuAddressTagAllocator(csr, timeStampGpuAddress)); + csr.perfCounterAllocator.reset(new FixedGpuAddressTagAllocator(csr, perfCountersGpuAddress)); + + pCmdQ->setPerfCountersEnabled(true, 1); + + MockKernel kernel(program.get(), kernelInfo, *pDevice); + ASSERT_EQ(CL_SUCCESS, kernel.initialize()); + + size_t globalOffsets[3] = {0, 0, 0}; + size_t workItems[3] = {1, 1, 1}; + uint32_t dimensions = 1; + cl_event event; + cl_kernel clKernel = &kernel; + + static_cast *>(pCmdQ)->enqueueKernel( + clKernel, + dimensions, + globalOffsets, + workItems, + nullptr, + 0, + nullptr, + &event); + + auto pEvent = static_cast *>(event); + EXPECT_EQ(pEvent->getHwTimeStampNode()->getGpuAddress(), timeStampGpuAddress); + EXPECT_EQ(pEvent->getHwPerfCounterNode()->getGpuAddress(), perfCountersGpuAddress); + parseCommands(*pCmdQ); + + auto itor = expectStoreRegister(cmdList.begin(), timeStampGpuAddress + offsetof(HwTimeStamps, ContextStartTS), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdBegin), INSTR_MMIO_NOOPID); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqBegin), INSTR_MMIO_RPSTAT1); + for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) { + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Gp) + i * sizeof(cl_uint), + INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint)); + } + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User), 0); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User) + 8, 0); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User) + 12, 4); + + // after WALKER: + + itor = expectStoreRegister(itor, timeStampGpuAddress + offsetof(HwTimeStamps, ContextEndTS), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.OaStatus), INSTR_GFX_OFFSETS::INSTR_OA_STATUS); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.OaHead), INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.OaTail), INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR); + for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) { + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Gp) + i * sizeof(cl_uint), + INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint)); + } + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdEnd), INSTR_MMIO_NOOPID); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqEnd), INSTR_MMIO_RPSTAT1); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User), 0); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User) + 8, 0); + itor = expectStoreRegister(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User) + 12, 4); + + EXPECT_TRUE(pEvent->calcProfilingData()); + + clReleaseEvent(event); + + pCmdQ->setPerfCountersEnabled(false, UINT32_MAX); +} + struct MockTimestampContainer : public TimestampPacketContainer { ~MockTimestampContainer() override { for (const auto &node : timestampPacketNodes) {