/* * Copyright (C) 2018-2021 Intel Corporation * * SPDX-License-Identifier: MIT * */ #include "shared/source/os_interface/os_interface.h" #include "shared/source/utilities/tag_allocator.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/unit_test/utilities/base_object_utils.h" #include "opencl/source/command_queue/command_queue_hw.h" #include "opencl/source/command_queue/enqueue_common.h" #include "opencl/source/command_queue/enqueue_kernel.h" #include "opencl/source/command_queue/enqueue_marker.h" #include "opencl/source/command_queue/enqueue_migrate_mem_objects.h" #include "opencl/source/helpers/dispatch_info.h" #include "opencl/test/unit_test/command_queue/command_enqueue_fixture.h" #include "opencl/test/unit_test/event/event_fixture.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_context.h" #include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_program.h" #include "opencl/test/unit_test/mocks/mock_timestamp_container.h" #include "opencl/test/unit_test/os_interface/mock_performance_counters.h" #include "test.h" namespace NEO { struct ProfilingTests : public CommandEnqueueFixture, public ::testing::Test { void SetUp() override { CommandEnqueueFixture::SetUp(CL_QUEUE_PROFILING_ENABLE); program = ReleaseableObjectPtr(new MockProgram(toClDeviceVector(*pClDevice))); program->setContext(&ctx); kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 32; kernelInfo.setCrossThreadDataSize(sizeof(crossThreadData)); kernelInfo.setLocalIds({1, 1, 1}); kernelInfo.heapInfo.pKernelHeap = kernelIsa; kernelInfo.heapInfo.KernelHeapSize = sizeof(kernelIsa); } void TearDown() override { CommandEnqueueFixture::TearDown(); } ReleaseableObjectPtr program; SKernelBinaryHeaderCommon kernelHeader = {}; MockKernelInfo kernelInfo; MockContext ctx; uint32_t kernelIsa[32]; uint32_t crossThreadData[32]; }; HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndForWorkloadWithKernelWhenGetCSFromCmdQueueThenEnoughSpaceInCS) { typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM; typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; MockKernel kernel(program.get(), kernelInfo, *pClDevice); uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(MI_STORE_REGISTER_MEM) + sizeof(GPGPU_WALKER) + HardwareCommandsHelper::getSizeRequiredCS(); MultiDispatchInfo multiDispatchInfo(&kernel); auto &commandStreamNDRangeKernel = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0, false, false); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, &kernel, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamNDRangeKernel.getAvailableSpace(), requiredSize); auto &commandStreamTask = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0, false, false); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, &kernel, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize); } HWTEST_F(ProfilingTests, GivenCommandQueueWithProfilingAndForWorkloadWithNoKernelWhenGetCSFromCmdQueueThenEnoughSpaceInCS) { typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM; typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::WALKER_TYPE GPGPU_WALKER; uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM); MultiDispatchInfo multiDispatchInfo(nullptr); auto &commandStreamMigrateMemObjects = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0, false, false); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, false, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize); auto &commandStreamMarker = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0, false, false); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MARKER, true, false, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize); } HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndForWorkloadWithTwoKernelsInMdiWhenGetCSFromCmdQueueThenEnoughSpaceInCS) { typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM; typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; MockKernel kernel(program.get(), kernelInfo, *pClDevice); uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + HardwareCommandsHelper::getSizeRequiredCS(); requiredSize += 2 * sizeof(GPGPU_WALKER); DispatchInfo dispatchInfo; dispatchInfo.setKernel(&kernel); MultiDispatchInfo multiDispatchInfo; multiDispatchInfo.push(dispatchInfo); multiDispatchInfo.push(dispatchInfo); auto &commandStreamTask = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, multiDispatchInfo, nullptr, 0, false, false); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_TASK, CsrDependencies(), true, false, false, *pCmdQ, multiDispatchInfo, false, false); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize); } /* # Two additional PIPE_CONTROLs are expected before first MI_STORE_REGISTER_MEM (which is before GPGPU_WALKER) # and after second MI_STORE_REGISTER_MEM (which is after GPGPU_WALKER). */ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfolingWhenWalkerIsDispatchedThenPipeControlWithTimeStampIsPresentInCS) { typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; MockKernel kernel(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel.initialize()); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; static_cast *>(pCmdQ)->enqueueKernel( &kernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event); parseCommands(*pCmdQ); // Find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(cmdList.begin(), cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // Check PIPE_CONTROLs auto itorBeforePC = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforePC); auto pBeforePC = genCmdCast(*itorBeforePC); ASSERT_NE(nullptr, pBeforePC); EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable()); auto itorAfterPC = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterPC); auto pAfterPC = genCmdCast(*itorAfterPC); ASSERT_NE(nullptr, pAfterPC); EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable()); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation()); EXPECT_TRUE(static_cast *>(event)->calcProfilingData()); clReleaseEvent(event); } HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampDoesntHaveGPUTime) { MockKernel kernel(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel.initialize()); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; static_cast *>(pCmdQ)->enqueueKernel( &kernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event); auto mockEvent = static_cast *>(event); EXPECT_NE(0u, mockEvent->queueTimeStamp.GPUTimeStamp); EXPECT_NE(0u, mockEvent->queueTimeStamp.CPUTimeinNS); EXPECT_LT(mockEvent->queueTimeStamp.CPUTimeinNS, mockEvent->submitTimeStamp.CPUTimeinNS); EXPECT_EQ(0u, mockEvent->submitTimeStamp.GPUTimeStamp); clReleaseEvent(event); } /* # One additional MI_STORE_REGISTER_MEM is expected before and after GPGPU_WALKER. */ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProflingWhenWalkerIsDispatchedThenMiStoreRegisterMemIsPresentInCS) { typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM; typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; MockKernel kernel(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel.initialize()); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; static_cast *>(pCmdQ)->enqueueKernel( &kernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event); parseCommands(*pCmdQ); // Find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(cmdList.begin(), cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // Check MI_STORE_REGISTER_MEMs auto itorBeforeMI = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforeMI); auto pBeforeMI = genCmdCast(*itorBeforeMI); pBeforeMI = genCmdCast(*itorBeforeMI); ASSERT_NE(nullptr, pBeforeMI); EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pBeforeMI->getRegisterAddress()); auto itorAfterMI = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterMI); auto pAfterMI = genCmdCast(*itorAfterMI); ASSERT_NE(nullptr, pAfterMI); EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pAfterMI->getRegisterAddress()); ++itorAfterMI; pAfterMI = genCmdCast(*itorAfterMI); EXPECT_EQ(nullptr, pAfterMI); clReleaseEvent(event); } /* # Two additional PIPE_CONTROLs are expected before first MI_STORE_REGISTER_MEM (which is before GPGPU_WALKER) # and after second MI_STORE_REGISTER_MEM (which is after GPGPU_WALKER). # If queue is blocked commands should be added to event */ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueBlockedWithProfilingWhenWalkerIsDispatchedThenPipeControlWithTimeStampIsPresentInCS) { typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; MockKernel kernel(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel.initialize()); kernel.incRefInternal(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; cl_event ue = new UserEvent(); static_cast *>(pCmdQ)->enqueueKernel( &kernel, dimensions, globalOffsets, workItems, nullptr, 1, // one user event to block queue &ue, // user event not signaled &event); //rseCommands(*pCmdQ); ASSERT_NE(nullptr, pCmdQ->virtualEvent); ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand()); NEO::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream(); ASSERT_NE(nullptr, eventCommandStream); parseCommands(*eventCommandStream); // Find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(cmdList.begin(), cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // Check PIPE_CONTROLs auto itorBeforePC = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforePC); auto pBeforePC = genCmdCast(*itorBeforePC); ASSERT_NE(nullptr, pBeforePC); auto itorAfterPC = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterPC); auto pAfterPC = genCmdCast(*itorAfterPC); ASSERT_NE(nullptr, pAfterPC); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation()); clReleaseEvent(event); ((UserEvent *)ue)->release(); pCmdQ->isQueueBlocked(); } /* # One additional MI_STORE_REGISTER_MEM is expected before and after GPGPU_WALKER. # If queue is blocked commands should be added to event */ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueBlockedWithProfilingWhenWalkerIsDispatchedThenMiStoreRegisterMemIsPresentInCS) { typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM; typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; MockKernel kernel(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel.initialize()); kernel.incRefInternal(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; cl_event ue = new UserEvent(); static_cast *>(pCmdQ)->enqueueKernel( &kernel, dimensions, globalOffsets, workItems, nullptr, 1, // one user event to block queue &ue, // user event not signaled &event); // parseCommands(*pCmdQ); ASSERT_NE(nullptr, pCmdQ->virtualEvent); ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand()); NEO::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream(); ASSERT_NE(nullptr, eventCommandStream); parseCommands(*eventCommandStream); // Find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(cmdList.begin(), cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // Check MI_STORE_REGISTER_MEMs auto itorBeforeMI = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforeMI); auto pBeforeMI = genCmdCast(*itorBeforeMI); pBeforeMI = genCmdCast(*itorBeforeMI); ASSERT_NE(nullptr, pBeforeMI); EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pBeforeMI->getRegisterAddress()); auto itorAfterMI = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterMI); auto pAfterMI = genCmdCast(*itorAfterMI); ASSERT_NE(nullptr, pAfterMI); EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pAfterMI->getRegisterAddress()); ++itorAfterMI; EXPECT_EQ(itorAfterMI, cmdList.end()); clReleaseEvent(event); ((UserEvent *)ue)->release(); pCmdQ->isQueueBlocked(); } HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProflingWhenMarkerIsDispatchedThenPipeControlIsPresentInCS) { typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; cl_event event; static_cast *>(pCmdQ)->enqueueMarkerWithWaitList( 0, nullptr, &event); parseCommands(*pCmdQ); // Check PIPE_CONTROLs auto itorFirstPC = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorFirstPC); auto pFirstPC = genCmdCast(*itorFirstPC); ASSERT_NE(nullptr, pFirstPC); auto itorSecondPC = find(itorFirstPC, cmdList.end()); ASSERT_NE(cmdList.end(), itorSecondPC); auto pSecondPC = genCmdCast(*itorSecondPC); ASSERT_NE(nullptr, pSecondPC); EXPECT_TRUE(static_cast *>(event)->calcProfilingData()); clReleaseEvent(event); } HWTEST_F(ProfilingTests, givenNonKernelEnqueueWhenNonBlockedEnqueueThenSetCpuPath) { cl_event event; pCmdQ->enqueueBarrierWithWaitList(0, nullptr, &event); auto eventObj = static_cast(event); EXPECT_TRUE(eventObj->isCPUProfilingPath() == CL_TRUE); pCmdQ->finish(); uint64_t queued, submit, start, end; cl_int retVal; retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, 0); EXPECT_EQ(CL_SUCCESS, retVal); retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0); EXPECT_EQ(CL_SUCCESS, retVal); retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(uint64_t), &start, 0); EXPECT_EQ(CL_SUCCESS, retVal); retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_END, sizeof(uint64_t), &end, 0); EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_LT(0u, queued); EXPECT_LT(queued, submit); EXPECT_LT(submit, start); EXPECT_LT(start, end); eventObj->release(); } HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenNonBlockedEnqueueThenSetGpuPath) { cl_event event; pCmdQ->enqueueMarkerWithWaitList(0, nullptr, &event); auto eventObj = static_cast(event); EXPECT_TRUE(eventObj->isCPUProfilingPath() == CL_FALSE); pCmdQ->finish(); uint64_t queued, submit; cl_int retVal; retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, 0); EXPECT_EQ(CL_SUCCESS, retVal); retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0); EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_LT(0u, queued); EXPECT_LT(queued, submit); eventObj->release(); } HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenSetGpuPath) { cl_event event = nullptr; cl_event userEvent = new UserEvent(); pCmdQ->enqueueMarkerWithWaitList(1, &userEvent, &event); auto eventObj = static_cast(event); EXPECT_FALSE(eventObj->isCPUProfilingPath()); auto userEventObj = static_cast(userEvent); pCmdQ->flush(); userEventObj->setStatus(CL_COMPLETE); Event::waitForEvents(1, &event); uint64_t queued = 0u, submit = 0u; cl_int retVal; retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, 0); EXPECT_EQ(CL_SUCCESS, retVal); retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0); EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_LT(0u, queued); EXPECT_LT(queued, submit); eventObj->release(); userEventObj->release(); } HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenPipeControlsArePresentInCS) { typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; cl_event event = nullptr; cl_event userEvent = new UserEvent(); static_cast *>(pCmdQ)->enqueueMarkerWithWaitList(1, &userEvent, &event); auto eventObj = static_cast(event); EXPECT_FALSE(eventObj->isCPUProfilingPath()); auto userEventObj = static_cast(userEvent); pCmdQ->flush(); userEventObj->setStatus(CL_COMPLETE); Event::waitForEvents(1, &event); parseCommands(*pCmdQ); // Check PIPE_CONTROLs auto itorFirstPC = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorFirstPC); auto pFirstPC = genCmdCast(*itorFirstPC); ASSERT_NE(nullptr, pFirstPC); auto itorSecondPC = find(itorFirstPC, cmdList.end()); ASSERT_NE(cmdList.end(), itorSecondPC); auto pSecondPC = genCmdCast(*itorSecondPC); ASSERT_NE(nullptr, pSecondPC); EXPECT_TRUE(static_cast *>(event)->calcProfilingData()); eventObj->release(); userEventObj->release(); pCmdQ->isQueueBlocked(); } template struct MockTagNode : public TagNode { public: using TagNode::tagForCpuAccess; using TagNode::gfxAllocation; MockTagNode() { gfxAllocation = nullptr; tagForCpuAccess = nullptr; } }; class MyOSDeviceTime : public DeviceTime { double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override { EXPECT_FALSE(true); return 1.0; } uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override { EXPECT_FALSE(true); return 0; } bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *) override { EXPECT_FALSE(true); return false; } }; class MyOSTime : public OSTime { public: static int instanceNum; MyOSTime() { instanceNum++; this->deviceTime = std::make_unique(); } bool getCpuTime(uint64_t *timeStamp) override { EXPECT_FALSE(true); return false; }; double getHostTimerResolution() const override { EXPECT_FALSE(true); return 0; } uint64_t getCpuRawTimestamp() override { EXPECT_FALSE(true); return 0; } }; int MyOSTime::instanceNum = 0; using EventProfilingTest = ProfilingTests; HWCMDTEST_F(IGFX_GEN8_CORE, EventProfilingTest, givenEventWhenCompleteIsZeroThenCalcProfilingDataSetsEndTimestampInCompleteTimestampAndDoesntCallOsTimeMethods) { auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(nullptr)); MyOSTime::instanceNum = 0; device->setOSTime(new MyOSTime()); EXPECT_EQ(1, MyOSTime::instanceNum); MockContext context; cl_command_queue_properties props[5] = {0, 0, 0, 0, 0}; MockCommandQueue cmdQ(&context, device.get(), props, false); cmdQ.setProfilingEnabled(); cmdQ.device = device.get(); HwTimeStamps timestamp; timestamp.GlobalStartTS = 10; timestamp.ContextStartTS = 20; timestamp.GlobalEndTS = 80; timestamp.ContextEndTS = 56; timestamp.GlobalCompleteTS = 0; timestamp.ContextCompleteTS = 0; MockTagNode timestampNode; timestampNode.tagForCpuAccess = ×tamp; MockEvent event(&cmdQ, CL_COMPLETE, 0, 0); event.setCPUProfilingPath(false); event.timeStampNode = ×tampNode; event.calcProfilingData(); EXPECT_EQ(timestamp.ContextEndTS, timestamp.ContextCompleteTS); cmdQ.device = nullptr; event.timeStampNode = nullptr; } using EventProfilingTests = ProfilingTests; HWCMDTEST_F(IGFX_GEN8_CORE, EventProfilingTests, givenRawTimestampsDebugModeWhenDataIsQueriedThenRawDataIsReturned) { DebugManagerStateRestore stateRestore; DebugManager.flags.ReturnRawGpuTimestamps.set(1); auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(nullptr)); MyOSTime::instanceNum = 0; device->setOSTime(new MyOSTime()); EXPECT_EQ(1, MyOSTime::instanceNum); MockContext context; cl_command_queue_properties props[5] = {0, 0, 0, 0, 0}; MockCommandQueue cmdQ(&context, device.get(), props, false); cmdQ.setProfilingEnabled(); cmdQ.device = device.get(); HwTimeStamps timestamp; timestamp.GlobalStartTS = 10; timestamp.ContextStartTS = 20; timestamp.GlobalEndTS = 80; timestamp.ContextEndTS = 56; timestamp.GlobalCompleteTS = 0; timestamp.ContextCompleteTS = 70; MockTagNode timestampNode; timestampNode.tagForCpuAccess = ×tamp; MockEvent event(&cmdQ, CL_COMPLETE, 0, 0); cl_event clEvent = &event; event.queueTimeStamp.CPUTimeinNS = 1; event.queueTimeStamp.GPUTimeStamp = 2; event.submitTimeStamp.CPUTimeinNS = 3; event.submitTimeStamp.GPUTimeStamp = 4; event.setCPUProfilingPath(false); event.timeStampNode = ×tampNode; event.calcProfilingData(); cl_ulong queued, submited, start, end, complete; clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, nullptr); clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submited, nullptr); clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr); clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr); clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &complete, nullptr); EXPECT_EQ(timestamp.ContextCompleteTS, complete); EXPECT_EQ(timestamp.ContextEndTS, end); EXPECT_EQ(timestamp.ContextStartTS, start); EXPECT_EQ(event.submitTimeStamp.GPUTimeStamp, submited); EXPECT_EQ(event.queueTimeStamp.GPUTimeStamp, queued); event.timeStampNode = nullptr; } HWCMDTEST_F(IGFX_GEN8_CORE, EventProfilingTest, givenRawTimestampsDebugModeWhenStartTimeStampLTQueueTimeStampThenIncreaseStartTimeStamp) { DebugManagerStateRestore stateRestore; DebugManager.flags.ReturnRawGpuTimestamps.set(1); auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(nullptr)); MyOSTime::instanceNum = 0; device->setOSTime(new MyOSTime()); EXPECT_EQ(1, MyOSTime::instanceNum); MockContext context(device.get()); MockCommandQueue cmdQ(&context, device.get(), nullptr, false); cmdQ.setProfilingEnabled(); cmdQ.device = device.get(); HwTimeStamps timestamp; timestamp.GlobalStartTS = 0; timestamp.ContextStartTS = 20; timestamp.GlobalEndTS = 80; timestamp.ContextEndTS = 56; timestamp.GlobalCompleteTS = 0; timestamp.ContextCompleteTS = 70; MockTagNode timestampNode; timestampNode.tagForCpuAccess = ×tamp; MockEvent event(&cmdQ, CL_COMPLETE, 0, 0); cl_event clEvent = &event; event.queueTimeStamp.CPUTimeinNS = 83; event.queueTimeStamp.GPUTimeStamp = 1; event.setCPUProfilingPath(false); event.timeStampNode = ×tampNode; event.calcProfilingData(); cl_ulong queued, start; clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, nullptr); clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr); EXPECT_LT(queued, start); event.timeStampNode = nullptr; } struct ProfilingWithPerfCountersTests : public PerformanceCountersFixture, ::testing::Test { void SetUp() override { SetUp(defaultHwInfo.get()); } void SetUp(const NEO::HardwareInfo *hardwareInfo) { PerformanceCountersFixture::SetUp(); createPerfCounters(); HardwareInfo hwInfo = *hardwareInfo; if (hwInfo.capabilityTable.defaultEngineType == aub_stream::EngineType::ENGINE_CCS) { hwInfo.featureTable.ftrCCSNode = true; } pDevice = MockDevice::createWithNewExecutionEnvironment(&hwInfo, 0); pClDevice = std::make_unique(*pDevice, nullptr); pDevice->setPerfCounters(performanceCountersBase.release()); context = std::make_unique(pClDevice.get()); cl_int retVal = CL_SUCCESS; cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; pCmdQ.reset(CommandQueue::create(context.get(), pClDevice.get(), properties, false, retVal)); kernel = std::make_unique(*pClDevice); } void TearDown() override { PerformanceCountersFixture::TearDown(); } template GenCmdList::iterator expectStoreRegister(const GenCmdList &cmdList, GenCmdList::iterator itor, uint64_t memoryAddress, uint32_t registerAddress) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; itor = find(itor, cmdList.end()); EXPECT_NE(cmdList.end(), itor); auto pStore = genCmdCast(*itor); EXPECT_EQ(memoryAddress, pStore->getMemoryAddress()); EXPECT_EQ(registerAddress, pStore->getRegisterAddress()); itor++; return itor; } MockDevice *pDevice = nullptr; std::unique_ptr pClDevice; std::unique_ptr context; std::unique_ptr pCmdQ; std::unique_ptr kernel; }; struct ProfilingWithPerfCountersOnCCSTests : ProfilingWithPerfCountersTests { void SetUp() override { auto hwInfo = *defaultHwInfo; hwInfo.capabilityTable.defaultEngineType = aub_stream::ENGINE_CCS; ProfilingWithPerfCountersTests::SetUp(&hwInfo); } void TearDown() override { ProfilingWithPerfCountersTests::TearDown(); } }; HWTEST_F(ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCounterAndForWorkloadWithNoKernelWhenGetCSFromCmdQueueThenEnoughSpaceInCS) { typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM; typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::WALKER_TYPE GPGPU_WALKER; pCmdQ->setPerfCountersEnabled(); uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM); MultiDispatchInfo multiDispatchInfo(nullptr); auto &commandStreamMigrateMemObjects = getCommandStream(*pCmdQ, CsrDependencies(), true, true, false, multiDispatchInfo, nullptr, 0, false, false); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, true, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize); auto &commandStreamMarker = getCommandStream(*pCmdQ, CsrDependencies(), true, true, false, multiDispatchInfo, nullptr, 0, false, false); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MARKER, true, true, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize); } HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCountersWhenWalkerIsDispatchedThenPipeControlWithTimeStampIsPresentInCS) { typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT; pCmdQ->setPerfCountersEnabled(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; static_cast *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event); ClHardwareParse parse; auto &cmdList = parse.cmdList; parse.parseCommands(*pCmdQ); // expect MI_REPORT_PERF_COUNT before WALKER auto itorBeforeReportPerf = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorBeforeReportPerf); // Find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(itorBeforeReportPerf, cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // Check PIPE_CONTROLs auto itorBeforePC = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforePC); auto pBeforePC = genCmdCast(*itorBeforePC); ASSERT_NE(nullptr, pBeforePC); EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable()); auto itorAfterPC = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterPC); auto pAfterPC = genCmdCast(*itorAfterPC); ASSERT_NE(nullptr, pAfterPC); EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable()); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation()); // expect MI_REPORT_PERF_COUNT after WALKER auto itorAfterReportPerf = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterReportPerf); EXPECT_TRUE(static_cast *>(event)->calcProfilingData()); clReleaseEvent(event); } HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCountersNoUserRegistersWhenWalkerIsDispatchedThenPipeControlWithTimeStampIsPresentInCS) { typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT; pCmdQ->setPerfCountersEnabled(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; static_cast *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event); ClHardwareParse parse; auto &cmdList = parse.cmdList; parse.parseCommands(*pCmdQ); // expect MI_REPORT_PERF_COUNT before WALKER auto itorBeforeReportPerf = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorBeforeReportPerf); // Find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(itorBeforeReportPerf, cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // Check PIPE_CONTROLs auto itorBeforePC = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforePC); auto pBeforePC = genCmdCast(*itorBeforePC); ASSERT_NE(nullptr, pBeforePC); EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable()); auto itorAfterPC = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterPC); auto pAfterPC = genCmdCast(*itorAfterPC); ASSERT_NE(nullptr, pAfterPC); EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable()); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation()); // expect MI_REPORT_PERF_COUNT after WALKER auto itorAfterReportPerf = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterReportPerf); EXPECT_TRUE(static_cast *>(event)->calcProfilingData()); clReleaseEvent(event); } HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GivenCommandQueueBlockedWithProflingPerfCounterWhenWalkerIsDispatchedThenPipeControlWithTimeStampIsPresentInCS) { typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT; pCmdQ->setPerfCountersEnabled(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; cl_event ue = new UserEvent(); static_cast *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 1, // one user event to block queue &ue, // user event not signaled &event); //rseCommands(*pCmdQ); ASSERT_NE(nullptr, pCmdQ->virtualEvent); ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand()); NEO::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream(); ASSERT_NE(nullptr, eventCommandStream); HardwareParse parse; auto &cmdList = parse.cmdList; parse.parseCommands(*eventCommandStream); // expect MI_REPORT_PERF_COUNT before WALKER auto itorBeforeReportPerf = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorBeforeReportPerf); // Find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(itorBeforeReportPerf, cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // Check PIPE_CONTROLs auto itorBeforePC = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforePC); auto pBeforePC = genCmdCast(*itorBeforePC); ASSERT_NE(nullptr, pBeforePC); auto itorAfterPC = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterPC); auto pAfterPC = genCmdCast(*itorAfterPC); ASSERT_NE(nullptr, pAfterPC); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation()); // expect MI_REPORT_PERF_COUNT after WALKER auto itorAfterReportPerf = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterReportPerf); clReleaseEvent(event); ((UserEvent *)ue)->release(); pCmdQ->isQueueBlocked(); } HWTEST_F(ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCountersNoEventWhenWalkerIsDispatchedThenPipeControlWithTimeStampIsNotPresentInCS) { typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::WALKER_TYPE GPGPU_WALKER; typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT; pCmdQ->setPerfCountersEnabled(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; static_cast *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, nullptr); ClHardwareParse parse; auto &cmdList = parse.cmdList; parse.parseCommands(*pCmdQ); // expect no MI_REPORT_PERF_COUNT before WALKER auto itorBeforeReportPerf = find(cmdList.begin(), cmdList.end()); ASSERT_EQ(cmdList.end(), itorBeforeReportPerf); // Find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(cmdList.begin(), cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // Check PIPE_CONTROLs auto itorBeforePC = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforePC); auto pBeforePC = genCmdCast(*itorBeforePC); ASSERT_NE(nullptr, pBeforePC); EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable()); auto itorAfterPC = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterPC); auto pAfterPC = genCmdCast(*itorAfterPC); ASSERT_NE(nullptr, pAfterPC); EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable()); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_NO_WRITE, pBeforePC->getPostSyncOperation()); // expect MI_REPORT_PERF_COUNT after WALKER auto itorAfterReportPerf = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_EQ(cmdList.end(), itorAfterReportPerf); } template struct FixedGpuAddressTagAllocator : MockTagAllocator { using TagAllocator::usedTags; using TagAllocator::deferredTags; struct MockTagNode : TagNode { void setGpuAddress(uint64_t value) { this->gpuAddress = value; } }; FixedGpuAddressTagAllocator(CommandStreamReceiver &csr, uint64_t gpuAddress) : MockTagAllocator(csr.getRootDeviceIndex(), csr.getMemoryManager(), csr.getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, sizeof(TagType), false, csr.getOsContext().getDeviceBitfield()) { auto tag = reinterpret_cast(this->freeTags.peekHead()); tag->setGpuAddress(gpuAddress); } }; HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCountersWhenWalkerIsDispatchedThenRegisterStoresArePresentInCS) { uint64_t timeStampGpuAddress = 0x123456000; uint64_t perfCountersGpuAddress = 0xabcdef000; auto &csr = pDevice->getUltCommandStreamReceiver(); csr.profilingTimeStampAllocator.reset(new FixedGpuAddressTagAllocator(csr, timeStampGpuAddress)); csr.perfCounterAllocator.reset(new FixedGpuAddressTagAllocator(csr, perfCountersGpuAddress)); pCmdQ->setPerfCountersEnabled(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; static_cast *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event); auto pEvent = static_cast *>(event); EXPECT_EQ(pEvent->getHwTimeStampNode()->getGpuAddress(), timeStampGpuAddress); EXPECT_EQ(pEvent->getHwPerfCounterNode()->getGpuAddress(), perfCountersGpuAddress); ClHardwareParse parse; auto &cmdList = parse.cmdList; parse.parseCommands(*pCmdQ); auto itor = expectStoreRegister(cmdList, cmdList.begin(), timeStampGpuAddress + offsetof(HwTimeStamps, ContextStartTS), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); // after WALKER: itor = expectStoreRegister(cmdList, itor, timeStampGpuAddress + offsetof(HwTimeStamps, ContextEndTS), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); EXPECT_TRUE(pEvent->calcProfilingData()); clReleaseEvent(event); } HWTEST_F(ProfilingWithPerfCountersTests, givenTimestampPacketsEnabledWhenEnqueueIsCalledThenDontAllocateHwTimeStamps) { auto &csr = pDevice->getUltCommandStreamReceiver(); csr.timestampPacketWriteEnabled = true; auto mockAllocator = new FixedGpuAddressTagAllocator(csr, 0x123); csr.profilingTimeStampAllocator.reset(mockAllocator); auto myCmdQ = std::make_unique>(pCmdQ->getContextPtr(), pClDevice.get(), nullptr); myCmdQ->setProfilingEnabled(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; cl_event event; myCmdQ->enqueueKernel(kernel->mockKernel, 1, globalOffsets, workItems, nullptr, 0, nullptr, &event); EXPECT_EQ(!!myCmdQ->getTimestampPacketContainer(), mockAllocator->usedTags.peekIsEmpty()); EXPECT_TRUE(mockAllocator->deferredTags.peekIsEmpty()); clReleaseEvent(event); } HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersOnCCSTests, givenCommandQueueBlockedWithProfilingPerfCountersWhenWalkerIsDispatchedThenPipeControlWithTimeStampIsPresentInCS) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; using MI_REPORT_PERF_COUNT = typename FamilyType::MI_REPORT_PERF_COUNT; pCmdQ->setPerfCountersEnabled(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; cl_event userEvent = clCreateUserEvent(context.get(), nullptr); CommandQueueHw *cmdQHw = static_cast *>(pCmdQ.get()); cmdQHw->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 1, &userEvent, &event); ASSERT_NE(nullptr, pCmdQ->virtualEvent); ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand()); NEO::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream(); ASSERT_NE(nullptr, eventCommandStream); HardwareParse parse; auto &cmdList = parse.cmdList; parse.parseCommands(*eventCommandStream); // expect MI_REPORT_PERF_COUNT before WALKER auto itorBeforeReportPerf = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorBeforeReportPerf); // find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(itorBeforeReportPerf, cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // check PIPE_CONTROLs auto itorBeforePC = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforePC); auto pBeforePC = genCmdCast(*itorBeforePC); ASSERT_NE(nullptr, pBeforePC); auto itorAfterPC = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterPC); auto pAfterPC = genCmdCast(*itorAfterPC); ASSERT_NE(nullptr, pAfterPC); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation()); // expect MI_REPORT_PERF_COUNT after WALKER auto itorAfterReportPerf = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterReportPerf); clReleaseEvent(event); clReleaseEvent(userEvent); } HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersOnCCSTests, givenCommandQueueWithProfilingPerfCountersWhenWalkerIsDispatchedThenPipeControlWithTimeStampIsPresentInCS) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; using MI_REPORT_PERF_COUNT = typename FamilyType::MI_REPORT_PERF_COUNT; pCmdQ->setPerfCountersEnabled(); size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; uint32_t dimensions = 1; cl_event event; CommandQueueHw *cmdQHw = static_cast *>(pCmdQ.get()); cmdQHw->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event); ClHardwareParse parse; auto &cmdList = parse.cmdList; parse.parseCommands(*pCmdQ); // expect MI_REPORT_PERF_COUNT before WALKER auto itorBeforeReportPerf = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorBeforeReportPerf); // find GPGPU_WALKER auto itorGPGPUWalkerCmd = find(itorBeforeReportPerf, cmdList.end()); GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd); ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd); // check PIPE_CONTROLs auto itorBeforePC = reverse_find(rItorGPGPUWalkerCmd, cmdList.rbegin()); ASSERT_NE(cmdList.rbegin(), itorBeforePC); auto pBeforePC = genCmdCast(*itorBeforePC); ASSERT_NE(nullptr, pBeforePC); EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable()); auto itorAfterPC = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterPC); auto pAfterPC = genCmdCast(*itorAfterPC); ASSERT_NE(nullptr, pAfterPC); EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable()); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation()); // expect MI_REPORT_PERF_COUNT after WALKER auto itorAfterReportPerf = find(itorGPGPUWalkerCmd, cmdList.end()); ASSERT_NE(cmdList.end(), itorAfterReportPerf); EXPECT_TRUE(static_cast *>(event)->calcProfilingData()); clReleaseEvent(event); } struct MockTimestampContainer : public TimestampPacketContainer { ~MockTimestampContainer() override { for (const auto &node : timestampPacketNodes) { auto mockNode = static_cast> *>(node); delete mockNode->tagForCpuAccess; delete node; } timestampPacketNodes.clear(); } }; struct ProfilingTimestampPacketsTest : public ::testing::Test { void SetUp() override { DebugManager.flags.ReturnRawGpuTimestamps.set(true); cmdQ->setProfilingEnabled(); ev->timestampPacketContainer = std::make_unique(); } void addTimestampNode(uint32_t contextStart, uint32_t contextEnd, uint32_t globalStart, uint32_t globalEnd) { auto node = new MockTagNode>(); auto timestampPacketStorage = new TimestampPackets(); node->tagForCpuAccess = timestampPacketStorage; uint32_t values[4] = {contextStart, globalStart, contextEnd, globalEnd}; timestampPacketStorage->assignDataToAllTimestamps(0, values); ev->timestampPacketContainer->add(node); } void addTimestampNodeMultiOsContext(uint32_t globalStart[16], uint32_t globalEnd[16], uint32_t contextStart[16], uint32_t contextEnd[16], uint32_t size) { auto node = new MockTagNode>(); auto timestampPacketStorage = new TimestampPackets(); node->setPacketsUsed(size); for (uint32_t i = 0u; i < node->getPacketsUsed(); ++i) { uint32_t values[4] = {contextStart[i], globalStart[i], contextEnd[i], globalEnd[i]}; timestampPacketStorage->assignDataToAllTimestamps(i, values); } node->tagForCpuAccess = timestampPacketStorage; ev->timestampPacketContainer->add(node); } void initTimestampNodeMultiOsContextData(uint32_t globalStart[16], uint32_t globalEnd[16], uint32_t size) { for (uint32_t i = 0u; i < size; ++i) { globalStart[i] = 100; } globalStart[5] = {50}; for (uint32_t i = 0u; i < size; ++i) { globalEnd[i] = 200; } globalEnd[7] = {350}; } DebugManagerStateRestore restorer; MockContext context; cl_command_queue_properties props[5] = {0, 0, 0, 0, 0}; ReleaseableObjectPtr cmdQ = clUniquePtr(new MockCommandQueue(&context, context.getDevice(0), props, false)); ReleaseableObjectPtr> ev = clUniquePtr(new MockEvent(cmdQ.get(), CL_COMMAND_USER, CompletionStamp::notReady, CompletionStamp::notReady)); }; TEST_F(ProfilingTimestampPacketsTest, givenTimestampsPacketContainerWithOneElementAndTimestampNodeWhenCalculatingProfilingThenTimesAreTakenFromPacket) { addTimestampNode(10, 11, 12, 13); HwTimeStamps hwTimestamps; hwTimestamps.ContextStartTS = 100; hwTimestamps.ContextEndTS = 110; hwTimestamps.GlobalStartTS = 120; MockTagNode hwTimestampsNode; hwTimestampsNode.tagForCpuAccess = &hwTimestamps; ev->timeStampNode = &hwTimestampsNode; ev->calcProfilingData(); EXPECT_EQ(12u, ev->getStartTimeStamp()); EXPECT_EQ(13u, ev->getEndTimeStamp()); EXPECT_EQ(12u, ev->getGlobalStartTimestamp()); ev->timeStampNode = nullptr; } TEST_F(ProfilingTimestampPacketsTest, givenMultiOsContextCapableSetToTrueWhenCalcProfilingDataIsCalledThenCorrectedValuesAreReturned) { uint32_t globalStart[16] = {0}; uint32_t globalEnd[16] = {0}; uint32_t contextStart[16] = {0}; uint32_t contextEnd[16] = {0}; initTimestampNodeMultiOsContextData(globalStart, globalEnd, 16u); addTimestampNodeMultiOsContext(globalStart, globalEnd, contextStart, contextEnd, 16u); auto &device = reinterpret_cast(cmdQ->getDevice()); auto &csr = device.getUltCommandStreamReceiver(); csr.multiOsContextCapable = true; ev->calcProfilingData(); EXPECT_EQ(50u, ev->getStartTimeStamp()); EXPECT_EQ(350u, ev->getEndTimeStamp()); } TEST_F(ProfilingTimestampPacketsTest, givenTimestampPacketWithoutProfilingDataWhenCalculatingThenDontUseThatPacket) { uint32_t globalStart0 = 20; uint32_t globalEnd0 = 51; uint32_t contextStart0 = 21; uint32_t contextEnd0 = 50; uint32_t globalStart1 = globalStart0 - 1; uint32_t globalEnd1 = globalEnd0 + 1; uint32_t contextStart1 = contextStart0 - 1; uint32_t contextEnd1 = contextEnd0 + 1; addTimestampNodeMultiOsContext(&globalStart0, &globalEnd0, &contextStart0, &contextEnd0, 1); addTimestampNodeMultiOsContext(&globalStart1, &globalEnd1, &contextStart1, &contextEnd1, 1); auto &device = reinterpret_cast(cmdQ->getDevice()); auto &csr = device.getUltCommandStreamReceiver(); csr.multiOsContextCapable = true; ev->timestampPacketContainer->peekNodes()[1]->setProfilingCapable(false); ev->calcProfilingData(); EXPECT_EQ(static_cast(globalStart0), ev->getStartTimeStamp()); EXPECT_EQ(static_cast(globalEnd0), ev->getEndTimeStamp()); } TEST_F(ProfilingTimestampPacketsTest, givenPrintTimestampPacketContentsSetWhenCalcProfilingDataThenTimeStampsArePrinted) { DebugManagerStateRestore restorer; DebugManager.flags.PrintTimestampPacketContents.set(true); testing::internal::CaptureStdout(); auto &device = reinterpret_cast(cmdQ->getDevice()); auto &csr = device.getUltCommandStreamReceiver(); csr.multiOsContextCapable = true; uint32_t globalStart[16] = {0}; uint32_t globalEnd[16] = {0}; uint32_t contextStart[16] = {0}; uint32_t contextEnd[16] = {0}; for (int i = 0; i < 16; i++) { globalStart[i] = 2 * i; globalEnd[i] = 500 * i; contextStart[i] = 7 * i; contextEnd[i] = 94 * i; } addTimestampNodeMultiOsContext(globalStart, globalEnd, contextStart, contextEnd, 16u); ev->calcProfilingData(); std::string output = testing::internal::GetCapturedStdout(); std::stringstream expected; expected << "Timestamp 0, profiling capable: " << ev->timestampPacketContainer->peekNodes()[0]->isProfilingCapable() << ", "; for (int i = 0; i < 16; i++) { expected << "packet " << i << ": " << "global start: " << globalStart[i] << ", " << "global end: " << globalEnd[i] << ", " << "context start: " << contextStart[i] << ", " << "context end: " << contextEnd[i] << std::endl; } EXPECT_EQ(0, output.compare(expected.str().c_str())); } TEST_F(ProfilingTimestampPacketsTest, givenTimestampsPacketContainerWithThreeElementsWhenCalculatingProfilingThenTimesAreTakenFromProperPacket) { addTimestampNode(10, 11, 12, 13); addTimestampNode(1, 21, 22, 13); addTimestampNode(5, 31, 2, 13); ev->calcProfilingData(); EXPECT_EQ(2u, ev->getStartTimeStamp()); EXPECT_EQ(13u, ev->getEndTimeStamp()); EXPECT_EQ(2u, ev->getGlobalStartTimestamp()); } TEST_F(ProfilingTimestampPacketsTest, givenTimestampsPacketContainerWithZeroElementsWhenCalculatingProfilingThenDataIsNotCalculated) { EXPECT_EQ(0u, ev->timestampPacketContainer->peekNodes().size()); ev->calcProfilingData(); EXPECT_FALSE(ev->getDataCalcStatus()); } } // namespace NEO