/*
 * Copyright (C) 2017-2019 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "core/unit_tests/helpers/debug_manager_state_restore.h"
#include "core/unit_tests/utilities/base_object_utils.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_queue/enqueue_common.h"
#include "runtime/command_queue/enqueue_kernel.h"
#include "runtime/command_queue/enqueue_marker.h"
#include "runtime/command_queue/enqueue_migrate_mem_objects.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/memory_manager/surface.h"
#include "runtime/os_interface/os_interface.h"
#include "runtime/utilities/tag_allocator.h"
#include "test.h"
#include "unit_tests/command_queue/command_enqueue_fixture.h"
#include "unit_tests/event/event_fixture.h"
#include "unit_tests/fixtures/device_fixture.h"
#include "unit_tests/mocks/mock_command_queue.h"
#include "unit_tests/mocks/mock_context.h"
#include "unit_tests/mocks/mock_event.h"
#include "unit_tests/mocks/mock_kernel.h"
#include "unit_tests/mocks/mock_program.h"
#include "unit_tests/os_interface/mock_performance_counters.h"

namespace NEO {

struct ProfilingTests : public CommandEnqueueFixture,
                        public ::testing::Test {
    void SetUp() override {
        CommandEnqueueFixture::SetUp(CL_QUEUE_PROFILING_ENABLE);

        program = ReleaseableObjectPtr<MockProgram>(new MockProgram(*pDevice->getExecutionEnvironment()));
        program->setContext(&ctx);

        memset(&kernelHeader, 0, sizeof(kernelHeader));
        kernelHeader.KernelHeapSize = sizeof(kernelIsa);

        memset(&dataParameterStream, 0, sizeof(dataParameterStream));
        dataParameterStream.DataParameterStreamSize = sizeof(crossThreadData);

        executionEnvironment = {};
        memset(&executionEnvironment, 0, sizeof(executionEnvironment));
        executionEnvironment.CompiledSIMD32 = 1;
        executionEnvironment.LargestCompiledSIMDSize = 32;

        memset(&threadPayload, 0, sizeof(threadPayload));
        threadPayload.LocalIDXPresent = 1;
        threadPayload.LocalIDYPresent = 1;
        threadPayload.LocalIDZPresent = 1;

        kernelInfo.heapInfo.pKernelHeap = kernelIsa;
        kernelInfo.heapInfo.pKernelHeader = &kernelHeader;
        kernelInfo.patchInfo.dataParameterStream = &dataParameterStream;
        kernelInfo.patchInfo.executionEnvironment = &executionEnvironment;
        kernelInfo.patchInfo.threadPayload = &threadPayload;
    }

    void TearDown() override {
        CommandEnqueueFixture::TearDown();
    }

    ReleaseableObjectPtr<MockProgram> program;

    SKernelBinaryHeaderCommon kernelHeader = {};
    SPatchDataParameterStream dataParameterStream = {};
    SPatchExecutionEnvironment executionEnvironment = {};
    SPatchThreadPayload threadPayload = {};
    KernelInfo kernelInfo;

    MockContext ctx;

    uint32_t kernelIsa[32];
    uint32_t crossThreadData[32];
};

HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfilingAndForWorkloadWithKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);

    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(MI_STORE_REGISTER_MEM) + sizeof(GPGPU_WALKER) + HardwareCommandsHelper<FamilyType>::getSizeRequiredCS(&kernel);

    MultiDispatchInfo multiDispatchInfo(&kernel);
    auto &commandStreamNDRangeKernel = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, &kernel);
    EXPECT_GE(expectedSizeCS, requiredSize);
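    // Note: requiredSize above is a hand-built lower bound for profiling a single walker
    // (two timestamp PIPE_CONTROLs, two MI_STORE_REGISTER_MEMs and the walker itself, on top
    // of the kernel's own command-stream requirement); both the driver's size estimate and
    // the command stream actually obtained are expected to cover at least this much.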
    EXPECT_GE(commandStreamNDRangeKernel.getAvailableSpace(), requiredSize);

    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0);
    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, &kernel);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
}

HWTEST_F(ProfilingTests, GIVENCommandQueueWithProfilingAndForWorkloadWithNoKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::WALKER_TYPE GPGPU_WALKER;

    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM);

    MultiDispatchInfo multiDispatchInfo(nullptr);
    auto &commandStreamMigrateMemObjects = getCommandStream<FamilyType, CL_COMMAND_MIGRATE_MEM_OBJECTS>(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, false, *pCmdQ, nullptr);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize);

    auto &commandStreamMarker = getCommandStream<FamilyType, CL_COMMAND_MARKER>(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0);
    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MARKER, true, false, *pCmdQ, nullptr);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize);
}

HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfilingAndForWorkloadWithTwoKernelsInMdiWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);

    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + HardwareCommandsHelper<FamilyType>::getSizeRequiredCS(&kernel);
    requiredSize += 2 * sizeof(GPGPU_WALKER);

    DispatchInfo dispatchInfo;
    dispatchInfo.setKernel(&kernel);

    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    multiDispatchInfo.push(dispatchInfo);

    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_TASK, CsrDependencies(), true, false, false, *pCmdQ, multiDispatchInfo);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
}

/*
 * Two additional PIPE_CONTROLs are expected: one before the first MI_STORE_REGISTER_MEM
 * (which is before GPGPU_WALKER) and one after the second MI_STORE_REGISTER_MEM
 * (which is after GPGPU_WALKER).
 */
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfilingWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_event event;
    cl_kernel clKernel = &kernel;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);

    parseCommands<FamilyType>(*pCmdQ);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rbegin());
    ASSERT_NE(cmdList.rbegin(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);
    EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable());

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);
    EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable());

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    EXPECT_TRUE(static_cast<MockEvent<Event> *>(event)->calcProfilingData());

    clReleaseEvent(event);
}

/*
 * One additional MI_STORE_REGISTER_MEM is expected before and after GPGPU_WALKER.
 */
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfilingWHENWalkerIsDispatchedTHENMiStoreRegisterMemIsPresentInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_event event;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        &kernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);

    parseCommands<FamilyType>(*pCmdQ);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);

    // Check MI_STORE_REGISTER_MEMs
    auto itorBeforeMI = reverse_find<MI_STORE_REGISTER_MEM *>(rItorGPGPUWalkerCmd, cmdList.rbegin());
    ASSERT_NE(cmdList.rbegin(), itorBeforeMI);
    auto pBeforeMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorBeforeMI);
    ASSERT_NE(nullptr, pBeforeMI);
    EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pBeforeMI->getRegisterAddress());

    auto itorAfterMI = find<MI_STORE_REGISTER_MEM *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterMI);
    auto pAfterMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorAfterMI);
    ASSERT_NE(nullptr, pAfterMI);
    EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pAfterMI->getRegisterAddress());

    // No further MI_STORE_REGISTER_MEM is expected after the second one
    ++itorAfterMI;
    pAfterMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorAfterMI);
    EXPECT_EQ(nullptr, pAfterMI);

    clReleaseEvent(event);
}

/*
 * Two additional PIPE_CONTROLs are expected: one before the first MI_STORE_REGISTER_MEM
 * (which is before GPGPU_WALKER) and one after the second MI_STORE_REGISTER_MEM
 * (which is after GPGPU_WALKER).
 * If the queue is blocked, the commands should be added to the event.
 */
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueBlockedWithProfilingWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_event event;
    cl_event ue = new UserEvent();

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        &kernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        1,   // one user event to block queue
        &ue, // user event not signaled
        &event);

    // parseCommands<FamilyType>(*pCmdQ);
    ASSERT_NE(nullptr, pCmdQ->virtualEvent);
    ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand());
    NEO::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream();
    ASSERT_NE(nullptr, eventCommandStream);
    parseCommands<FamilyType>(*eventCommandStream);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rbegin());
    ASSERT_NE(cmdList.rbegin(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    clReleaseEvent(event);
    ((UserEvent *)ue)->release();
    pCmdQ->isQueueBlocked();
}

/*
 * One additional MI_STORE_REGISTER_MEM is expected before and after GPGPU_WALKER.
 * If the queue is blocked, the commands should be added to the event.
 */
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueBlockedWithProfilingWHENWalkerIsDispatchedTHENMiStoreRegisterMemIsPresentInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_event event;
    cl_event ue = new UserEvent();

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        &kernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        1,   // one user event to block queue
        &ue, // user event not signaled
        &event);

    // parseCommands<FamilyType>(*pCmdQ);
    ASSERT_NE(nullptr, pCmdQ->virtualEvent);
    ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand());
    NEO::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream();
    ASSERT_NE(nullptr, eventCommandStream);
    parseCommands<FamilyType>(*eventCommandStream);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);

    // Check MI_STORE_REGISTER_MEMs
    auto itorBeforeMI = reverse_find<MI_STORE_REGISTER_MEM *>(rItorGPGPUWalkerCmd, cmdList.rbegin());
    ASSERT_NE(cmdList.rbegin(), itorBeforeMI);
    auto pBeforeMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorBeforeMI);
    ASSERT_NE(nullptr, pBeforeMI);
    EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pBeforeMI->getRegisterAddress());

    auto itorAfterMI = find<MI_STORE_REGISTER_MEM *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterMI);
    auto pAfterMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorAfterMI);
    ASSERT_NE(nullptr, pAfterMI);
    EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pAfterMI->getRegisterAddress());

    ++itorAfterMI;
    EXPECT_EQ(itorAfterMI, cmdList.end());

    clReleaseEvent(event);
    ((UserEvent *)ue)->release();
    pCmdQ->isQueueBlocked();
}

HWTEST_F(ProfilingTests, givenNonKernelEnqueueWhenNonBlockedEnqueueThenSetCpuPath) {
    cl_event event;
    pCmdQ->enqueueMarkerWithWaitList(0, nullptr, &event);
    auto eventObj = static_cast<Event *>(event);
    EXPECT_TRUE(eventObj->isCPUProfilingPath() == CL_TRUE);
    pCmdQ->finish();

    uint64_t queued, submit, start, end;
    cl_int retVal;

    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, 0);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(uint64_t), &start, 0);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_END, sizeof(uint64_t), &end, 0);
    EXPECT_EQ(CL_SUCCESS, retVal);

    EXPECT_LT(0u, queued);
    EXPECT_LT(queued, submit);
    EXPECT_LT(submit, start);
    EXPECT_LT(start, end);

    eventObj->release();
}

template <typename TagType>
struct MockTagNode : public TagNode<TagType> {
  public:
    using TagNode<TagType>::gfxAllocation;
    using TagNode<TagType>::tagForCpuAccess;
    MockTagNode() {
        gfxAllocation = nullptr;
        tagForCpuAccess = nullptr;
    }
};

class MyOSTime : public OSTime {
  public:
    static int instanceNum;
    MyOSTime() {
        instanceNum++;
    }
    double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override {
        EXPECT_FALSE(true);
        return 1.0;
    }
    bool getCpuGpuTime(TimeStampData *pGpuCpuTime) override {
        EXPECT_FALSE(true);
        return false;
    }
    bool getCpuTime(uint64_t *timeStamp) override {
        EXPECT_FALSE(true);
        return false;
    }
    double getHostTimerResolution() const
    override {
        EXPECT_FALSE(true);
        return 0;
    }
    uint64_t getCpuRawTimestamp() override {
        EXPECT_FALSE(true);
        return 0;
    }
};

int MyOSTime::instanceNum = 0;

TEST(EventProfilingTest, givenEventWhenCompleteIsZeroThenCalcProfilingDataSetsEndTimestampInCompleteTimestampAndDoesntCallOsTimeMethods) {
    std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    MyOSTime::instanceNum = 0;
    device->setOSTime(new MyOSTime());
    EXPECT_EQ(1, MyOSTime::instanceNum);

    MockContext context;
    cl_command_queue_properties props[5] = {0, 0, 0, 0, 0};
    MockCommandQueue cmdQ(&context, device.get(), props);
    cmdQ.setProfilingEnabled();
    cmdQ.device = device.get();

    HwTimeStamps timestamp;
    timestamp.GlobalStartTS = 10;
    timestamp.ContextStartTS = 20;
    timestamp.GlobalEndTS = 80;
    timestamp.ContextEndTS = 56;
    timestamp.GlobalCompleteTS = 0;
    timestamp.ContextCompleteTS = 0;

    MockTagNode<HwTimeStamps> timestampNode;
    timestampNode.tagForCpuAccess = &timestamp;

    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
    event.setCPUProfilingPath(false);
    event.timeStampNode = &timestampNode;
    event.calcProfilingData();

    EXPECT_EQ(timestamp.ContextEndTS, timestamp.ContextCompleteTS);

    cmdQ.device = nullptr;
    event.timeStampNode = nullptr;
}

TEST(EventProfilingTest, givenRawTimestampsDebugModeWhenDataIsQueriedThenRawDataIsReturned) {
    DebugManagerStateRestore stateRestore;
    DebugManager.flags.ReturnRawGpuTimestamps.set(1);
    std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    MyOSTime::instanceNum = 0;
    device->setOSTime(new MyOSTime());
    EXPECT_EQ(1, MyOSTime::instanceNum);

    MockContext context;
    cl_command_queue_properties props[5] = {0, 0, 0, 0, 0};
    MockCommandQueue cmdQ(&context, device.get(), props);
    cmdQ.setProfilingEnabled();
    cmdQ.device = device.get();

    HwTimeStamps timestamp;
    timestamp.GlobalStartTS = 10;
    timestamp.ContextStartTS = 20;
    timestamp.GlobalEndTS = 80;
    timestamp.ContextEndTS = 56;
    timestamp.GlobalCompleteTS = 0;
    timestamp.ContextCompleteTS = 70;

    MockTagNode<HwTimeStamps> timestampNode;
    timestampNode.tagForCpuAccess = &timestamp;

    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
    cl_event clEvent = &event;
    event.queueTimeStamp.CPUTimeinNS = 1;
    event.queueTimeStamp.GPUTimeStamp = 2;
    event.submitTimeStamp.CPUTimeinNS = 3;
    event.submitTimeStamp.GPUTimeStamp = 4;
    event.setCPUProfilingPath(false);
    event.timeStampNode = &timestampNode;
    event.calcProfilingData();

    cl_ulong queued, submitted, start, end, complete;
    clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, nullptr);
    clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submitted, nullptr);
    clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
    clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr);
    clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &complete, nullptr);

    EXPECT_EQ(timestamp.ContextCompleteTS, complete);
    EXPECT_EQ(timestamp.ContextEndTS, end);
    EXPECT_EQ(timestamp.ContextStartTS, start);
    EXPECT_EQ(event.submitTimeStamp.GPUTimeStamp, submitted);
    EXPECT_EQ(event.queueTimeStamp.GPUTimeStamp, queued);

    event.timeStampNode = nullptr;
}

struct ProfilingWithPerfCountersTests : public ProfilingTests,
                                        public PerformanceCountersFixture {
    void SetUp() override {
        PerformanceCountersFixture::SetUp();
        ProfilingTests::SetUp();
        createPerfCounters();
        pDevice->setPerfCounters(performanceCountersBase.release());
    }

    void TearDown() override {
        ProfilingTests::TearDown();
        PerformanceCountersFixture::TearDown();
    }

    template <typename GfxFamily>
    GenCmdList::iterator
    expectStoreRegister(GenCmdList::iterator itor, uint64_t memoryAddress, uint32_t registerAddress) {
        using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

        itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
        EXPECT_NE(cmdList.end(), itor);
        auto pStore = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
        EXPECT_EQ(memoryAddress, pStore->getMemoryAddress());
        EXPECT_EQ(registerAddress, pStore->getRegisterAddress());
        itor++;
        return itor;
    }
};

HWTEST_F(ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCounterAndForWorkloadWithNoKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::WALKER_TYPE GPGPU_WALKER;

    pCmdQ->setPerfCountersEnabled(true, 0);

    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM);

    MultiDispatchInfo multiDispatchInfo(nullptr);
    auto &commandStreamMigrateMemObjects = getCommandStream<FamilyType, CL_COMMAND_MIGRATE_MEM_OBJECTS>(*pCmdQ, CsrDependencies(), true, true, false, multiDispatchInfo, nullptr, 0);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, true, *pCmdQ, nullptr);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize);

    auto &commandStreamMarker = getCommandStream<FamilyType, CL_COMMAND_MARKER>(*pCmdQ, CsrDependencies(), true, true, false, multiDispatchInfo, nullptr, 0);
    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MARKER, true, true, *pCmdQ, nullptr);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize);

    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}

HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;
    typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 0);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_event event;
    cl_kernel clKernel = &kernel;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);

    parseCommands<FamilyType>(*pCmdQ);

    // expect MI_REPORT_PERF_COUNT before WALKER
    auto itorBeforeReportPerf = find<MI_REPORT_PERF_COUNT *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorBeforeReportPerf);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(itorBeforeReportPerf, cmdList.end());
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rbegin());
    ASSERT_NE(cmdList.rbegin(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);
    EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable());

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);
    EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable());

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    // expect MI_REPORT_PERF_COUNT after WALKER
    auto itorAfterReportPerf = find<MI_REPORT_PERF_COUNT *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(),
              itorAfterReportPerf);

    EXPECT_TRUE(static_cast<MockEvent<Event> *>(event)->calcProfilingData());

    clReleaseEvent(event);
    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}

HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersNoUserRegistersWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;
    typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 0);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_event event;
    cl_kernel clKernel = &kernel;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);

    parseCommands<FamilyType>(*pCmdQ);

    // expect MI_REPORT_PERF_COUNT before WALKER
    auto itorBeforeReportPerf = find<MI_REPORT_PERF_COUNT *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorBeforeReportPerf);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(itorBeforeReportPerf, cmdList.end());
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rbegin());
    ASSERT_NE(cmdList.rbegin(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);
    EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable());

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);
    EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable());

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    // expect MI_REPORT_PERF_COUNT after WALKER
    auto itorAfterReportPerf = find<MI_REPORT_PERF_COUNT *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterReportPerf);

    EXPECT_TRUE(static_cast<MockEvent<Event> *>(event)->calcProfilingData());

    clReleaseEvent(event);
    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}

HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueBlockedWithProfilingPerfCounterWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;
    typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 0);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_event event;
    cl_event ue = new UserEvent();

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        &kernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        1,   // one user event to block queue
        &ue, // user event not signaled
        &event);

    // parseCommands<FamilyType>(*pCmdQ);
    ASSERT_NE(nullptr, pCmdQ->virtualEvent);
    ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand());
    NEO::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream();
    ASSERT_NE(nullptr, eventCommandStream);
    parseCommands<FamilyType>(*eventCommandStream);

    // expect MI_REPORT_PERF_COUNT before WALKER
    auto itorBeforeReportPerf = find<MI_REPORT_PERF_COUNT *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorBeforeReportPerf);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(itorBeforeReportPerf, cmdList.end());
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rbegin());
    ASSERT_NE(cmdList.rbegin(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    // expect MI_REPORT_PERF_COUNT after WALKER
    auto itorAfterReportPerf = find<MI_REPORT_PERF_COUNT *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterReportPerf);

    clReleaseEvent(event);
    ((UserEvent *)ue)->release();
    pCmdQ->isQueueBlocked();
    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}

HWTEST_F(ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersNoEventWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsNotPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::WALKER_TYPE GPGPU_WALKER;
    typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 0);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_kernel clKernel = &kernel;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, nullptr);

    parseCommands<FamilyType>(*pCmdQ);

    // expect no MI_REPORT_PERF_COUNT before WALKER
    auto itorBeforeReportPerf = find<MI_REPORT_PERF_COUNT *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(cmdList.end(), itorBeforeReportPerf);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rbegin());
    ASSERT_NE(cmdList.rbegin(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);
    EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable());

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);
    EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable());

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_NO_WRITE, pBeforePC->getPostSyncOperation());

    // expect no MI_REPORT_PERF_COUNT after WALKER
    auto itorAfterReportPerf = find<MI_REPORT_PERF_COUNT *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_EQ(cmdList.end(), itorAfterReportPerf);

    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}

template <typename TagType>
struct FixedGpuAddressTagAllocator : TagAllocator<TagType> {
    struct MockTagNode : TagNode<TagType> {
        void setGpuAddress(uint64_t value) { this->gpuAddress = value; }
    };

    FixedGpuAddressTagAllocator(CommandStreamReceiver &csr, uint64_t gpuAddress)
        : TagAllocator<TagType>(csr.getMemoryManager(), csr.getPreferredTagPoolSize(), MemoryConstants::cacheLineSize) {
        auto tag = reinterpret_cast<MockTagNode *>(this->freeTags.peekHead());
        tag->setGpuAddress(gpuAddress);
    }
};

HWTEST_F(ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersWHENWalkerIsDispatchedTHENRegisterStoresArePresentInCS) {
    uint64_t timeStampGpuAddress = 0x123456000;
    uint64_t perfCountersGpuAddress = 0xabcdef000;

    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
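    // Swap the CSR's tag allocators for fixed-address ones so the MI_STORE_REGISTER_MEM
    // destination addresses checked below are deterministic.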
    csr.profilingTimeStampAllocator.reset(new FixedGpuAddressTagAllocator<HwTimeStamps>(csr, timeStampGpuAddress));
    csr.perfCounterAllocator.reset(new FixedGpuAddressTagAllocator<HwPerfCounter>(csr, perfCountersGpuAddress));

    pCmdQ->setPerfCountersEnabled(true, 0);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_event event;
    cl_kernel clKernel = &kernel;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);

    auto pEvent = static_cast<MockEvent<Event> *>(event);
    EXPECT_EQ(pEvent->getHwTimeStampNode()->getGpuAddress(), timeStampGpuAddress);
    EXPECT_EQ(pEvent->getHwPerfCounterNode()->getGpuAddress(), perfCountersGpuAddress);

    parseCommands<FamilyType>(*pCmdQ);

    auto itor = expectStoreRegister<FamilyType>(cmdList.begin(), timeStampGpuAddress + offsetof(HwTimeStamps, ContextStartTS), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
    // after WALKER:
    itor = expectStoreRegister<FamilyType>(itor, timeStampGpuAddress + offsetof(HwTimeStamps, ContextEndTS), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);

    EXPECT_TRUE(pEvent->calcProfilingData());

    clReleaseEvent(event);
    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}

struct MockTimestampContainer : public TimestampPacketContainer {
    ~MockTimestampContainer() override {
        for (const auto &node : timestampPacketNodes) {
            delete node->tagForCpuAccess;
            delete node;
        }
        timestampPacketNodes.clear();
    }
};

struct ProfilingTimestampPacketsTest : public ::testing::Test {
    void SetUp() override {
        DebugManager.flags.ReturnRawGpuTimestamps.set(true);
        cmdQ->setProfilingEnabled();
        ev->timestampPacketContainer = std::make_unique<MockTimestampContainer>();
    }

    void addTimestampNode(int contextStart, int contextEnd, int globalStart) {
        auto node = new MockTagNode<TimestampPacketStorage>();
        auto timestampPacketStorage = new TimestampPacketStorage();
        node->tagForCpuAccess = timestampPacketStorage;

        timestampPacketStorage->packets[0].contextStart = contextStart;
        timestampPacketStorage->packets[0].contextEnd = contextEnd;
        timestampPacketStorage->packets[0].globalStart = globalStart;

        ev->timestampPacketContainer->add(node);
    }

    DebugManagerStateRestore restorer;
    MockContext context;
    cl_command_queue_properties props[5] = {0, 0, 0, 0, 0};
    ReleaseableObjectPtr<MockCommandQueue> cmdQ = clUniquePtr(new MockCommandQueue(&context, context.getDevice(0), props));
    ReleaseableObjectPtr<MockEvent<Event>> ev = clUniquePtr(new MockEvent<Event>(cmdQ.get(), CL_COMMAND_USER, Event::eventNotReady, Event::eventNotReady));
};

TEST_F(ProfilingTimestampPacketsTest, givenTimestampsPacketContainerWithOneElementAndTimestampNodeWhenCalculatingProfilingThenTimesAreTakenFromPacket) {
    addTimestampNode(10, 11, 12);

    HwTimeStamps hwTimestamps;
    hwTimestamps.ContextStartTS = 100;
    hwTimestamps.ContextEndTS = 110;
    hwTimestamps.GlobalStartTS = 120;

    MockTagNode<HwTimeStamps> hwTimestampsNode;
    hwTimestampsNode.tagForCpuAccess = &hwTimestamps;
    ev->timeStampNode = &hwTimestampsNode;

    ev->calcProfilingData();

    EXPECT_EQ(10u, ev->getStartTimeStamp());
    EXPECT_EQ(11u, ev->getEndTimeStamp());
    EXPECT_EQ(12u, ev->getGlobalStartTimestamp());

    ev->timeStampNode = nullptr;
}

TEST_F(ProfilingTimestampPacketsTest, givenTimestampsPacketContainerWithThreeElementsWhenCalculatingProfilingThenTimesAreTakenFromProperPacket) {
    addTimestampNode(10, 11, 12);
    addTimestampNode(1, 21, 22);
    addTimestampNode(5, 31, 2);

    ev->calcProfilingData();

    EXPECT_EQ(1u, ev->getStartTimeStamp());
    EXPECT_EQ(31u, ev->getEndTimeStamp());
    EXPECT_EQ(2u, ev->getGlobalStartTimestamp());
}

TEST_F(ProfilingTimestampPacketsTest,
       givenTimestampsPacketContainerWithZeroElementsWhenCalculatingProfilingThenDataIsNotCalculated) {
    EXPECT_EQ(0u, ev->timestampPacketContainer->peekNodes().size());
    ev->calcProfilingData();
    EXPECT_FALSE(ev->getDataCalcStatus());
}

} // namespace NEO