Files
compute-runtime/unit_tests/profiling/profiling_tests.cpp
Maciej Dziuban 6f26ced3b5 Don't store MockProgram as member of fixture
This is to prepare for adding argument to MockProgram constructor. It'll have
to be constructed after ExecutionEnvironment creation, for example after
DeviceFixture::SetUp.

Change-Id: I37b08f814679271820a07fb29cf1fb6b517c8376
Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
2018-08-10 10:13:33 +02:00

807 lines
34 KiB
C++

/*
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_queue/enqueue_common.h"
#include "runtime/command_queue/enqueue_migrate_mem_objects.h"
#include "runtime/command_queue/enqueue_kernel.h"
#include "runtime/command_queue/enqueue_marker.h"
#include "runtime/memory_manager/surface.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/os_interface/os_interface.h"
#include "runtime/utilities/tag_allocator.h"
#include "unit_tests/command_queue/command_enqueue_fixture.h"
#include "unit_tests/fixtures/device_fixture.h"
#include "unit_tests/mocks/mock_command_queue.h"
#include "unit_tests/mocks/mock_context.h"
#include "unit_tests/mocks/mock_event.h"
#include "unit_tests/mocks/mock_kernel.h"
#include "unit_tests/mocks/mock_program.h"
#include "unit_tests/os_interface/mock_performance_counters.h"
#include "test.h"
namespace OCLRT {
// Common fixture for the profiling tests: sets up the command-enqueue fixture
// with CL_QUEUE_PROFILING_ENABLE and builds a minimal KernelInfo (header, data
// parameter stream, execution environment, thread payload) that the individual
// tests wrap in a MockKernel.
struct ProfilingTests : public CommandEnqueueFixture,
                        public ::testing::Test {
    void SetUp() override {
        CommandEnqueueFixture::SetUp(CL_QUEUE_PROFILING_ENABLE);

        // The program is created here, after the base fixture has been set up,
        // rather than being a by-value member of the fixture.
        program = std::make_unique<MockProgram>();

        memset(&kernelHeader, 0, sizeof(kernelHeader));
        kernelHeader.KernelHeapSize = sizeof(kernelIsa);

        memset(&dataParameterStream, 0, sizeof(dataParameterStream));
        dataParameterStream.DataParameterStreamSize = sizeof(crossThreadData);

        // Zeroed exactly once, like the other patch tokens.
        memset(&executionEnvironment, 0, sizeof(executionEnvironment));
        executionEnvironment.CompiledSIMD32 = 1;
        executionEnvironment.LargestCompiledSIMDSize = 32;

        memset(&threadPayload, 0, sizeof(threadPayload));
        threadPayload.LocalIDXPresent = 1;
        threadPayload.LocalIDYPresent = 1;
        threadPayload.LocalIDZPresent = 1;

        kernelInfo.heapInfo.pKernelHeap = kernelIsa;
        kernelInfo.heapInfo.pKernelHeader = &kernelHeader;
        kernelInfo.patchInfo.dataParameterStream = &dataParameterStream;
        kernelInfo.patchInfo.executionEnvironment = &executionEnvironment;
        kernelInfo.patchInfo.threadPayload = &threadPayload;
    }

    void TearDown() override {
        CommandEnqueueFixture::TearDown();
    }

    std::unique_ptr<MockProgram> program;

    SKernelBinaryHeaderCommon kernelHeader;
    SPatchDataParameterStream dataParameterStream;
    SPatchExecutionEnvironment executionEnvironment;
    SPatchThreadPayload threadPayload;
    KernelInfo kernelInfo;

    // Zero-initialized so that whatever reads the kernel heap through
    // kernelInfo never sees indeterminate bytes.
    uint32_t kernelIsa[32] = {};
    // Only its size is used above (DataParameterStreamSize).
    uint32_t crossThreadData[32] = {};
};
// For NDRange-kernel and task commands: both the size estimate and the space
// available in the command stream obtained from the queue must be at least
// two PIPE_CONTROLs + two MI_STORE_REGISTER_MEMs + one GPGPU_WALKER + the
// KernelCommandsHelper requirement.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfilingAndForWorkloadWithKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    const uint64_t minimumBytes = 2 * sizeof(PIPE_CONTROL) +
                                  2 * sizeof(MI_STORE_REGISTER_MEM) +
                                  sizeof(GPGPU_WALKER) +
                                  KernelCommandsHelper<FamilyType>::getSizeRequiredCS();

    // CL_COMMAND_NDRANGE_KERNEL
    auto &ndRangeStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, true, false, nullptr);
    const auto ndRangeEstimate = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, nullptr);
    EXPECT_GE(ndRangeEstimate, minimumBytes);
    EXPECT_GE(ndRangeStream.getAvailableSpace(), minimumBytes);

    // CL_COMMAND_TASK
    auto &taskStream = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, false, nullptr);
    const auto taskEstimate = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, nullptr);
    EXPECT_GE(taskEstimate, minimumBytes);
    EXPECT_GE(taskStream.getAvailableSpace(), minimumBytes);
}
// For commands that dispatch no kernel (migrate-mem-objects, marker): both the
// size estimate and the space available in the command stream must be at least
// two PIPE_CONTROLs + four MI_STORE_REGISTER_MEMs.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfilingAndForWorkloadWithNoKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;

    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM);

    auto &commandStreamMigrateMemObjects = getCommandStream<FamilyType, CL_COMMAND_MIGRATE_MEM_OBJECTS>(*pCmdQ, true, false, nullptr);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, false, *pCmdQ, nullptr);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize);

    auto &commandStreamMarker = getCommandStream<FamilyType, CL_COMMAND_MARKER>(*pCmdQ, true, false, nullptr);
    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MARKER, true, false, *pCmdQ, nullptr);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize);
}
// A MultiDispatchInfo holding the same kernel dispatch twice: the total size
// estimate and the task command stream must cover two GPGPU_WALKERs on top of
// two PIPE_CONTROLs, four MI_STORE_REGISTER_MEMs and the KernelCommandsHelper
// requirement.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfilingAndForWorkloadWithTwoKernelsInMdiWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    uint64_t minimumBytes = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS();
    minimumBytes += 2 * sizeof(GPGPU_WALKER);

    MockKernel mockKernel(program.get(), kernelInfo, *pDevice);

    DispatchInfo singleDispatch;
    singleDispatch.setKernel(&mockKernel);

    // Constructed from one dispatch, then the same dispatch is pushed again.
    MultiDispatchInfo twoDispatches(singleDispatch);
    twoDispatches.push(singleDispatch);

    auto &taskStream = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, false, nullptr);
    const auto totalEstimate = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(true, false, *pCmdQ, twoDispatches);

    EXPECT_GE(totalEstimate, minimumBytes);
    EXPECT_GE(taskStream.getAvailableSpace(), minimumBytes);
}
/*
#   Two additional PIPE_CONTROLs are expected before first MI_STORE_REGISTER_MEM (which is before GPGPU_WALKER)
#   and after second MI_STORE_REGISTER_MEM (which is after GPGPU_WALKER).
*/
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfolingWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    // Starts as nullptr so that an enqueue which produced no event is caught
    // by the assertion below instead of dereferencing an indeterminate handle.
    cl_event event = nullptr;
    cl_kernel clKernel = &kernel;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        0,
        nullptr,
        &event);
    ASSERT_NE(nullptr, event);

    parseCommands<FamilyType>(*pCmdQ);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    // Walk backwards from the walker towards the start of the list. The end
    // sentinel of such a walk is rend(); rbegin() can never be reached from here.
    // NOTE(review): assumes reverse_find(first, last) advances `first` with ++ until `last` - confirm.
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rend());
    ASSERT_NE(cmdList.rend(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);
    EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable());

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);
    EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable());

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    EXPECT_TRUE(static_cast<Event *>(event)->calcProfilingData());

    clReleaseEvent(event);
}
/*
#   One additional MI_STORE_REGISTER_MEM is expected before and after GPGPU_WALKER.
*/
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProflingWHENWalkerIsDispatchedTHENMiStoreRegisterMemIsPresentInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    // Starts as nullptr so that an enqueue which produced no event is caught
    // by the assertion below instead of releasing an indeterminate handle.
    cl_event event = nullptr;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        &kernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        0,
        nullptr,
        &event);
    ASSERT_NE(nullptr, event);

    parseCommands<FamilyType>(*pCmdQ);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);

    // Check MI_STORE_REGISTER_MEMs
    // Walk backwards from the walker towards the start of the list. The end
    // sentinel of such a walk is rend(); rbegin() can never be reached from here.
    // NOTE(review): assumes reverse_find(first, last) advances `first` with ++ until `last` - confirm.
    auto itorBeforeMI = reverse_find<MI_STORE_REGISTER_MEM *>(rItorGPGPUWalkerCmd, cmdList.rend());
    ASSERT_NE(cmdList.rend(), itorBeforeMI);
    auto pBeforeMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorBeforeMI);
    ASSERT_NE(nullptr, pBeforeMI);
    EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pBeforeMI->getRegisterAddress());

    auto itorAfterMI = find<MI_STORE_REGISTER_MEM *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterMI);
    auto pAfterMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorAfterMI);
    ASSERT_NE(nullptr, pAfterMI);
    EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pAfterMI->getRegisterAddress());

    // The command that follows must not be another MI_STORE_REGISTER_MEM.
    // Only dereference when there actually is a following command.
    ++itorAfterMI;
    if (itorAfterMI != cmdList.end()) {
        pAfterMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorAfterMI);
        EXPECT_EQ(nullptr, pAfterMI);
    }

    clReleaseEvent(event);
}
/*
#   Two additional PIPE_CONTROLs are expected before first MI_STORE_REGISTER_MEM (which is before GPGPU_WALKER)
#   and after second MI_STORE_REGISTER_MEM (which is after GPGPU_WALKER).
#   If queue is blocked commands should be added to event
*/
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueBlockedWithProfilingWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    // Starts as nullptr so that an enqueue which produced no event is caught
    // by the assertion below instead of releasing an indeterminate handle.
    cl_event event = nullptr;
    cl_event ue = new UserEvent();

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        &kernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        1,   // one user event to block queue
        &ue, // user event not signaled
        &event);
    ASSERT_NE(nullptr, event);

    // The commands are inspected in the command stream held by the queue's
    // virtual event, not in the queue's own stream.
    ASSERT_NE(nullptr, pCmdQ->virtualEvent);
    ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand());
    OCLRT::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream();
    ASSERT_NE(nullptr, eventCommandStream);
    parseCommands<FamilyType>(*eventCommandStream);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    // Walk backwards from the walker towards the start of the list. The end
    // sentinel of such a walk is rend(); rbegin() can never be reached from here.
    // NOTE(review): assumes reverse_find(first, last) advances `first` with ++ until `last` - confirm.
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rend());
    ASSERT_NE(cmdList.rend(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    clReleaseEvent(event);
    static_cast<UserEvent *>(ue)->release();
}
/*
#   One additional MI_STORE_REGISTER_MEM is expected before and after GPGPU_WALKER.
#   If queue is blocked commands should be added to event
*/
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueBlockedWithProfilingWHENWalkerIsDispatchedTHENMiStoreRegisterMemIsPresentInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    // Starts as nullptr so that an enqueue which produced no event is caught
    // by the assertion below instead of releasing an indeterminate handle.
    cl_event event = nullptr;
    cl_event ue = new UserEvent();

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        &kernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        1,   // one user event to block queue
        &ue, // user event not signaled
        &event);
    ASSERT_NE(nullptr, event);

    // The commands are inspected in the command stream held by the queue's
    // virtual event, not in the queue's own stream.
    ASSERT_NE(nullptr, pCmdQ->virtualEvent);
    ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand());
    OCLRT::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream();
    ASSERT_NE(nullptr, eventCommandStream);
    parseCommands<FamilyType>(*eventCommandStream);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);

    // Check MI_STORE_REGISTER_MEMs
    // Walk backwards from the walker towards the start of the list. The end
    // sentinel of such a walk is rend(); rbegin() can never be reached from here.
    // NOTE(review): assumes reverse_find(first, last) advances `first` with ++ until `last` - confirm.
    auto itorBeforeMI = reverse_find<MI_STORE_REGISTER_MEM *>(rItorGPGPUWalkerCmd, cmdList.rend());
    ASSERT_NE(cmdList.rend(), itorBeforeMI);
    auto pBeforeMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorBeforeMI);
    ASSERT_NE(nullptr, pBeforeMI);
    EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pBeforeMI->getRegisterAddress());

    auto itorAfterMI = find<MI_STORE_REGISTER_MEM *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterMI);
    auto pAfterMI = genCmdCast<MI_STORE_REGISTER_MEM *>(*itorAfterMI);
    ASSERT_NE(nullptr, pAfterMI);
    EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, pAfterMI->getRegisterAddress());

    // The MI_STORE_REGISTER_MEM after the walker is expected to be the last
    // command in the parsed stream.
    ++itorAfterMI;
    EXPECT_EQ(itorAfterMI, cmdList.end());

    clReleaseEvent(event);
    static_cast<UserEvent *>(ue)->release();
}
// A marker enqueued with no wait list must yield an event that reports the
// CPU profiling path; after finish() all four profiling timestamps must be
// retrievable and strictly increasing: 0 < queued < submit < start < end.
HWTEST_F(ProfilingTests, givenNonKernelEnqueueWhenNonBlockedEnqueueThenSetCpuPath) {
    // Starts as nullptr so that an enqueue which produced no event is caught
    // by the assertion below instead of dereferencing an indeterminate handle.
    cl_event event = nullptr;
    pCmdQ->enqueueMarkerWithWaitList(0, nullptr, &event);
    ASSERT_NE(nullptr, event);

    auto eventObj = static_cast<Event *>(event);
    EXPECT_TRUE(eventObj->isCPUProfilingPath() == CL_TRUE);
    pCmdQ->finish(false);

    // Zero-initialized so that a failed query produces a deterministic
    // ordering failure below rather than a comparison of indeterminate values.
    uint64_t queued = 0;
    uint64_t submit = 0;
    uint64_t start = 0;
    uint64_t end = 0;
    cl_int retVal;

    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(uint64_t), &start, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_END, sizeof(uint64_t), &end, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

    EXPECT_LT(0u, queued);
    EXPECT_LT(queued, submit);
    EXPECT_LT(submit, start);
    EXPECT_LT(start, end);

    eventObj->release();
}
// TagNode subclass that re-exports the base members `tag` and `gfxAllocation`
// through using-declarations and starts with both set to nullptr, so a test
// can point `tag` at its own data.
template <typename TagType>
struct MockTagNode : public TagNode<TagType> {
    using TagNode<TagType>::gfxAllocation;
    using TagNode<TagType>::tag;

    MockTagNode() {
        tag = nullptr;
        gfxAllocation = nullptr;
    }
};
// OSTime stub for tests in which the OS timer must not be consulted: every
// overridden query records a gtest failure when called and returns a fixed
// value. instanceNum counts how many MyOSTime objects have been constructed.
class MyOSTime : public OSTime {
  public:
    static int instanceNum;

    MyOSTime() {
        ++instanceNum;
    }

    bool getCpuTime(uint64_t * /*timeStamp*/) override {
        EXPECT_FALSE(true);
        return false;
    }

    bool getCpuGpuTime(TimeStampData * /*pGpuCpuTime*/) override {
        EXPECT_FALSE(true);
        return false;
    }

    uint64_t getCpuRawTimestamp() override {
        EXPECT_FALSE(true);
        return 0;
    }

    double getHostTimerResolution() const override {
        EXPECT_FALSE(true);
        return 0.0;
    }

    double getDynamicDeviceTimerResolution(HardwareInfo const & /*hwInfo*/) const override {
        EXPECT_FALSE(true);
        return 1.0;
    }
};

int MyOSTime::instanceNum = 0;
// Runs Event::calcProfilingData() on an event whose timestamp node has
// GlobalCompleteTS == ContextCompleteTS == 0 and expects ContextCompleteTS to
// end up equal to ContextEndTS. The device's OSTime is replaced with MyOSTime,
// whose methods all record a test failure when called, so the test also fails
// if any of them is used.
TEST(EventProfilingTest, givenEventWhenCompleteIsZeroThenCalcProfilingDataSetsEndTimestampInCompleteTimestampAndDoesntCallOsTimeMethods) {
std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
// Reset the counter, install the stub and check exactly one instance was created.
MyOSTime::instanceNum = 0;
// NOTE(review): the device presumably takes ownership of the raw pointer - confirm.
device->setOSTime(new MyOSTime());
EXPECT_EQ(1, MyOSTime::instanceNum);
MockContext context;
cl_command_queue_properties props[5] = {0, 0, 0, 0, 0};
MockCommandQueue cmdQ(&context, device.get(), props);
cmdQ.setProfilingEnabled();
cmdQ.device = device.get();
// Hand-built timestamps; only the "complete" pair is zero.
HwTimeStamps timestamp;
timestamp.GlobalStartTS = 10;
timestamp.ContextStartTS = 20;
timestamp.GlobalEndTS = 80;
timestamp.ContextEndTS = 56;
timestamp.GlobalCompleteTS = 0;
timestamp.ContextCompleteTS = 0;
// The node lives on the stack and points at the local timestamp above.
MockTagNode<HwTimeStamps> timestampNode;
timestampNode.tag = &timestamp;
MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
// Disable the CPU profiling path so the timestamp node is what gets evaluated.
// NOTE(review): inferred from the setter name - confirm.
event.setCPUProfilingPath(false);
event.timeStampNode = &timestampNode;
event.calcProfilingData();
EXPECT_EQ(timestamp.ContextEndTS, timestamp.ContextCompleteTS);
// Clear the queue's device pointer before the locals are destroyed.
// NOTE(review): presumably keeps the queue's destructor away from the device - confirm.
cmdQ.device = nullptr;
}
// Fixture combining ProfilingTests with PerformanceCountersFixture: on top of
// the profiling setup it creates and initializes performance counters and
// installs them on the device.
struct ProfilingWithPerfCountersTests : public ProfilingTests,
public PerformanceCountersFixture {
void SetUp() override {
// The perf-counters fixture is set up first, then the profiling fixture.
PerformanceCountersFixture::SetUp();
ProfilingTests::SetUp();
createPerfCounters();
performanceCountersBase->initialize(platformDevices[0]);
// The counters object is released from performanceCountersBase and passed
// to the device as a raw pointer.
pDevice->setPerfCounters(performanceCountersBase.release());
}
void TearDown() override {
// Torn down in the reverse order of SetUp.
ProfilingTests::TearDown();
PerformanceCountersFixture::TearDown();
}
};
// With perf counters enabled on the queue, the size estimate and the available
// command-stream space for NDRange-kernel and task commands must cover the
// profiling commands, one walker and the begin/end perf-counter command groups
// summed below. Finally checks that disabling perf counters reports success
// twice in a row.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCounterAndForWorkloadWithKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
    using MI_REPORT_PERF_COUNT = typename FamilyType::MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 1);

    // profiling commands + walker
    const uint64_t profilingBytes = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + sizeof(GPGPU_WALKER) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS();
    // begin perf cmds
    const uint64_t perfBeginBytes = 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(MI_STORE_REGISTER_MEM) + OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM) + sizeof(MI_REPORT_PERF_COUNT) + pCmdQ->getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);
    // end perf cmds
    const uint64_t perfEndBytes = 2 * sizeof(PIPE_CONTROL) + 3 * sizeof(MI_STORE_REGISTER_MEM) + OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM) + sizeof(MI_REPORT_PERF_COUNT) + pCmdQ->getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);

    const uint64_t minimumBytes = profilingBytes + perfBeginBytes + perfEndBytes;

    // CL_COMMAND_NDRANGE_KERNEL
    auto &ndRangeStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, true, true, nullptr);
    const auto ndRangeEstimate = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, true, *pCmdQ, nullptr);
    EXPECT_GE(ndRangeEstimate, minimumBytes);
    EXPECT_GE(ndRangeStream.getAvailableSpace(), minimumBytes);

    // CL_COMMAND_TASK
    auto &taskStream = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, true, nullptr);
    const auto taskEstimate = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_TASK, true, true, *pCmdQ, nullptr);
    EXPECT_GE(taskEstimate, minimumBytes);
    EXPECT_GE(taskStream.getAvailableSpace(), minimumBytes);

    // Disabling twice in a row: both calls are expected to report success.
    const bool firstDisable = pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
    EXPECT_TRUE(firstDisable);
    const bool secondDisable = pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
    EXPECT_TRUE(secondDisable);
}
// With perf counters enabled on the queue, commands that dispatch no kernel
// (migrate-mem-objects, marker) must still get a size estimate and a command
// stream with room for two PIPE_CONTROLs + four MI_STORE_REGISTER_MEMs.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCounterAndForWorkloadWithNoKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;

    pCmdQ->setPerfCountersEnabled(true, 1);

    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM);

    auto &commandStreamMigrateMemObjects = getCommandStream<FamilyType, CL_COMMAND_MIGRATE_MEM_OBJECTS>(*pCmdQ, true, true, nullptr);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, true, *pCmdQ, nullptr);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize);

    auto &commandStreamMarker = getCommandStream<FamilyType, CL_COMMAND_MARKER>(*pCmdQ, true, true, nullptr);
    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MARKER, true, true, *pCmdQ, nullptr);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize);

    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}
// With perf counters enabled and a MultiDispatchInfo holding the same kernel
// dispatch twice, the total size estimate and the task command stream must
// cover two walkers, the profiling commands and the begin/end perf-counter
// command groups summed below.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersAndForWorkloadWithTwoKernelsInMdiWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
    using MI_REPORT_PERF_COUNT = typename FamilyType::MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 1);

    // profiling commands
    const uint64_t profilingBytes = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS();
    // one walker per dispatch
    const uint64_t walkerBytes = 2 * sizeof(GPGPU_WALKER);
    // begin perf cmds
    const uint64_t perfBeginBytes = 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(MI_STORE_REGISTER_MEM) + OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM) + sizeof(MI_REPORT_PERF_COUNT) + pCmdQ->getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);
    // end perf cmds
    const uint64_t perfEndBytes = 2 * sizeof(PIPE_CONTROL) + 3 * sizeof(MI_STORE_REGISTER_MEM) + OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM) + sizeof(MI_REPORT_PERF_COUNT) + pCmdQ->getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);

    const uint64_t minimumBytes = profilingBytes + walkerBytes + perfBeginBytes + perfEndBytes;

    MockKernel mockKernel(program.get(), kernelInfo, *pDevice);

    DispatchInfo singleDispatch;
    singleDispatch.setKernel(&mockKernel);

    // Constructed from one dispatch, then the same dispatch is pushed again.
    MultiDispatchInfo twoDispatches(singleDispatch);
    twoDispatches.push(singleDispatch);

    auto &taskStream = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, true, nullptr);
    const auto totalEstimate = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(true, true, *pCmdQ, twoDispatches);

    EXPECT_GE(totalEstimate, minimumBytes);
    EXPECT_GE(taskStream.getAvailableSpace(), minimumBytes);

    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}
// With perf counters enabled and an event requested, the dispatched stream
// must contain an MI_REPORT_PERF_COUNT before and after the GPGPU_WALKER, plus
// a stalling PIPE_CONTROL on each side of the walker; the one before it must
// write a timestamp.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;
    typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 1);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    // Starts as nullptr so that an enqueue which produced no event is caught
    // by the assertion below instead of dereferencing an indeterminate handle.
    cl_event event = nullptr;
    cl_kernel clKernel = &kernel;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        0,
        nullptr,
        &event);
    ASSERT_NE(nullptr, event);

    parseCommands<FamilyType>(*pCmdQ);

    // expect MI_REPORT_PERF_COUNT before WALKER
    auto itorBeforeReportPerf = find<MI_REPORT_PERF_COUNT *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorBeforeReportPerf);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(itorBeforeReportPerf, cmdList.end());
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    // Walk backwards from the walker towards the start of the list. The end
    // sentinel of such a walk is rend(); rbegin() can never be reached from here.
    // NOTE(review): assumes reverse_find(first, last) advances `first` with ++ until `last` - confirm.
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rend());
    ASSERT_NE(cmdList.rend(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);
    EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable());

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);
    EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable());

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    // expect MI_REPORT_PERF_COUNT after WALKER
    auto itorAfterReportPerf = find<MI_REPORT_PERF_COUNT *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterReportPerf);

    EXPECT_TRUE(static_cast<Event *>(event)->calcProfilingData());

    clReleaseEvent(event);

    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}
// Same checks as the perf-counters walker test, but perf counters are enabled
// with configuration value 2 instead of 1: MI_REPORT_PERF_COUNT before and
// after the GPGPU_WALKER, stalling PIPE_CONTROLs on both sides, and a
// timestamp write in the one before the walker.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersNoUserRegistersWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;
    typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 2);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    // Starts as nullptr so that an enqueue which produced no event is caught
    // by the assertion below instead of dereferencing an indeterminate handle.
    cl_event event = nullptr;
    cl_kernel clKernel = &kernel;

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        0,
        nullptr,
        &event);
    ASSERT_NE(nullptr, event);

    parseCommands<FamilyType>(*pCmdQ);

    // expect MI_REPORT_PERF_COUNT before WALKER
    auto itorBeforeReportPerf = find<MI_REPORT_PERF_COUNT *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorBeforeReportPerf);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(itorBeforeReportPerf, cmdList.end());
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    // Walk backwards from the walker towards the start of the list. The end
    // sentinel of such a walk is rend(); rbegin() can never be reached from here.
    // NOTE(review): assumes reverse_find(first, last) advances `first` with ++ until `last` - confirm.
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rend());
    ASSERT_NE(cmdList.rend(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);
    EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable());

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);
    EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable());

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    // expect MI_REPORT_PERF_COUNT after WALKER
    auto itorAfterReportPerf = find<MI_REPORT_PERF_COUNT *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterReportPerf);

    EXPECT_TRUE(static_cast<Event *>(event)->calcProfilingData());

    clReleaseEvent(event);

    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}
// Queue blocked by an unsignaled user event, perf counters enabled: the
// command stream held by the queue's virtual event must contain
// MI_REPORT_PERF_COUNT before and after the GPGPU_WALKER and a PIPE_CONTROL on
// each side of it, the one before the walker writing a timestamp.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueBlockedWithProflingPerfCounterWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;
    typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 1);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    // Starts as nullptr so that an enqueue which produced no event is caught
    // by the assertion below instead of releasing an indeterminate handle.
    cl_event event = nullptr;
    cl_event ue = new UserEvent();

    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        &kernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        1,   // one user event to block queue
        &ue, // user event not signaled
        &event);
    ASSERT_NE(nullptr, event);

    // The commands are inspected in the command stream held by the queue's
    // virtual event, not in the queue's own stream.
    ASSERT_NE(nullptr, pCmdQ->virtualEvent);
    ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand());
    OCLRT::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream();
    ASSERT_NE(nullptr, eventCommandStream);
    parseCommands<FamilyType>(*eventCommandStream);

    // expect MI_REPORT_PERF_COUNT before WALKER
    auto itorBeforeReportPerf = find<MI_REPORT_PERF_COUNT *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorBeforeReportPerf);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(itorBeforeReportPerf, cmdList.end());
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    // Walk backwards from the walker towards the start of the list. The end
    // sentinel of such a walk is rend(); rbegin() can never be reached from here.
    // NOTE(review): assumes reverse_find(first, last) advances `first` with ++ until `last` - confirm.
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rend());
    ASSERT_NE(cmdList.rend(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pBeforePC->getPostSyncOperation());

    // expect MI_REPORT_PERF_COUNT after WALKER
    auto itorAfterReportPerf = find<MI_REPORT_PERF_COUNT *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterReportPerf);

    clReleaseEvent(event);
    static_cast<UserEvent *>(ue)->release();

    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}
// Perf counters enabled but no event requested from the enqueue: the stream
// must contain no MI_REPORT_PERF_COUNT at all, and the PIPE_CONTROL before the
// GPGPU_WALKER must have no post-sync write (no timestamp). Stalling
// PIPE_CONTROLs are still expected on both sides of the walker.
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersNoEventWHENWalkerIsDispatchedTHENPipeControlWithTimeStampIsNotPresentInCS) {
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;
    typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT;

    pCmdQ->setPerfCountersEnabled(true, 1);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_kernel clKernel = &kernel;

    // No output event is requested (last argument is nullptr).
    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        0,
        nullptr,
        nullptr);

    parseCommands<FamilyType>(*pCmdQ);

    // expect no MI_REPORT_PERF_COUNT before WALKER
    auto itorBeforeReportPerf = find<MI_REPORT_PERF_COUNT *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(cmdList.end(), itorBeforeReportPerf);

    // Find GPGPU_WALKER
    auto itorGPGPUWalkerCmd = find<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorGPGPUWalkerCmd);
    GenCmdList::reverse_iterator rItorGPGPUWalkerCmd(itorGPGPUWalkerCmd);

    // Check PIPE_CONTROLs
    // Walk backwards from the walker towards the start of the list. The end
    // sentinel of such a walk is rend(); rbegin() can never be reached from here.
    // NOTE(review): assumes reverse_find(first, last) advances `first` with ++ until `last` - confirm.
    auto itorBeforePC = reverse_find<PIPE_CONTROL *>(rItorGPGPUWalkerCmd, cmdList.rend());
    ASSERT_NE(cmdList.rend(), itorBeforePC);
    auto pBeforePC = genCmdCast<PIPE_CONTROL *>(*itorBeforePC);
    ASSERT_NE(nullptr, pBeforePC);
    EXPECT_EQ(1u, pBeforePC->getCommandStreamerStallEnable());

    auto itorAfterPC = find<PIPE_CONTROL *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_NE(cmdList.end(), itorAfterPC);
    auto pAfterPC = genCmdCast<PIPE_CONTROL *>(*itorAfterPC);
    ASSERT_NE(nullptr, pAfterPC);
    EXPECT_EQ(1u, pAfterPC->getCommandStreamerStallEnable());

    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_NO_WRITE, pBeforePC->getPostSyncOperation());

    // expect no MI_REPORT_PERF_COUNT after WALKER
    auto itorAfterReportPerf = find<MI_REPORT_PERF_COUNT *>(itorGPGPUWalkerCmd, cmdList.end());
    ASSERT_EQ(cmdList.end(), itorAfterReportPerf);

    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}
} // namespace OCLRT