Use suggested group size in appendQueryKernelTimestamps

Change-Id: Ic4f0a5a47dfbf564b61baa2fbb2c03c0b5db4b14
This commit is contained in:
Kamil Diedrich
2020-08-18 16:48:29 +02:00
committed by sys_ocldev
parent b6cad3c206
commit 62428505c5
2 changed files with 222 additions and 7 deletions

View File

@@ -1464,7 +1464,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents);
commandContainer.addToResidencyContainer(dstptrAllocationStruct.alloc);
uint64_t *timestampsAddress = new uint64_t[numEvents];
std::unique_ptr<uint64_t[]> timestampsAddress = std::make_unique<uint64_t[]>(numEvents);
for (uint32_t i = 0u; i < numEvents; ++i) {
auto event = Event::fromHandle(phEvents[i]);
@@ -1488,7 +1488,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
commandContainer.addToResidencyContainer(timestampsGPUAddress);
commandContainer.getDeallocationContainer().push_back(timestampsGPUAddress);
bool result = device->getDriverHandle()->getMemoryManager()->copyMemoryToAllocation(timestampsGPUAddress, timestampsAddress, sizeof(uint64_t) * numEvents);
bool result = device->getDriverHandle()->getMemoryManager()->copyMemoryToAllocation(timestampsGPUAddress, timestampsAddress.get(), sizeof(uint64_t) * numEvents);
UNRECOVERABLE_IF(!result);
@@ -1507,8 +1507,22 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
offsetValPtr += sizeof(size_t);
}
ze_group_count_t dispatchFuncArgs{1, 1, 1};
builtinFunction->setGroupSize(numEvents, 1, 1);
uint32_t groupSizeX = 1u;
uint32_t groupSizeY = 1u;
uint32_t groupSizeZ = 1u;
if (builtinFunction->suggestGroupSize(numEvents, 1u, 1u,
&groupSizeX, &groupSizeY, &groupSizeZ) != ZE_RESULT_SUCCESS) {
DEBUG_BREAK_IF(true);
return ZE_RESULT_ERROR_UNKNOWN;
}
if (builtinFunction->setGroupSize(groupSizeX, groupSizeY, groupSizeZ) != ZE_RESULT_SUCCESS) {
DEBUG_BREAK_IF(true);
return ZE_RESULT_ERROR_UNKNOWN;
}
ze_group_count_t dispatchFuncArgs{numEvents / groupSizeX, 1u, 1u};
auto dstValPtr = static_cast<uintptr_t>(dstptrAllocationStruct.alloc->getGpuAddress());
@@ -1518,7 +1532,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
auto appendResult = appendLaunchKernel(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent, numWaitEvents,
phWaitEvents);
if (appendResult != ZE_RESULT_SUCCESS) {
delete[] timestampsAddress;
return appendResult;
}
@@ -1526,8 +1539,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(hSignalEvent);
}
delete[] timestampsAddress;
return ZE_RESULT_SUCCESS;
}

View File

@@ -12,7 +12,9 @@
#include "opencl/test/unit_test/mocks/mock_graphics_allocation.h"
#include "test.h"
#include "level_zero/core/source/builtin/builtin_functions_lib_impl.h"
#include "level_zero/core/source/cmdqueue/cmdqueue_imp.h"
#include "level_zero/core/source/kernel/kernel_imp.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
#include "level_zero/core/test/unit_tests/mocks/mock_event.h"
@@ -492,6 +494,208 @@ HWTEST2_F(AppendQueryKernelTimestamps, givenCommandListWhenAppendQueryKernelTime
driverHandle->freeMem(offsetAlloc);
}
// Queries twice as many events as maxWorkItemSizes[0] and checks that the
// QueryKernelTimestamps builtin was dispatched with the group size the kernel
// itself suggests for that event count, and with groupCountX equal to
// eventCount / groupSizeX (Y and Z counts stay at 1).
HWTEST2_F(AppendQueryKernelTimestamps, givenCommandListWhenAppendQueryKernelTimestampsWithEventsNumberBiggerThanMaxWorkItemSizeThenProperGroupSizeAndGroupCountIsSet, TestPlatforms) {
    MockCommandListForAppendLaunchKernel<gfxCoreFamily> commandList;
    commandList.initialize(device, false);

    device->getBuiltinFunctionsLib()->initFunctions();

    MockEvent event;
    event.waitScope = ZE_EVENT_SCOPE_FLAG_HOST;
    event.signalScope = ZE_EVENT_SCOPE_FLAG_HOST;

    const size_t eventCount = device->getNEODevice()->getDeviceInfo().maxWorkItemSizes[0] * 2u;
    // A zero count would make the group-count expectation below divide by zero.
    ASSERT_NE(0u, eventCount);
    const auto numEvents = static_cast<uint32_t>(eventCount);

    // The same mock event handle is reused for every slot; only the count matters here.
    auto events = std::make_unique<ze_event_handle_t[]>(eventCount);
    for (size_t i = 0u; i < eventCount; ++i) {
        events[i] = event.toHandle();
    }

    void *alloc = nullptr;
    auto result = driverHandle->allocDeviceMem(device, ZE_DEVICE_MEM_ALLOC_FLAG_FORCE_UINT32, 128, 1, &alloc);
    // Fatal on failure: the pointer is handed to the command list right after.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    result = commandList.appendQueryKernelTimestamps(numEvents, events.get(), alloc, nullptr, nullptr, 0u, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto queryKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestamps);
    EXPECT_EQ(queryKernel->getIsaAllocation()->getGpuAddress(), commandList.cmdListHelper.isaAllocation->getGpuAddress());

    // Ask the real builtin kernel for its suggestion to obtain the reference values.
    // groupSizeX starts at the event count so the division below stays well-defined
    // even if the suggestion call were to fail and leave it untouched.
    uint32_t groupSizeX = numEvents;
    uint32_t groupSizeY = 1u;
    uint32_t groupSizeZ = 1u;
    EXPECT_EQ(ZE_RESULT_SUCCESS, queryKernel->suggestGroupSize(numEvents, 1u, 1u, &groupSizeX, &groupSizeY, &groupSizeZ));

    EXPECT_EQ(groupSizeX, commandList.cmdListHelper.groupSize[0]);
    EXPECT_EQ(groupSizeY, commandList.cmdListHelper.groupSize[1]);
    EXPECT_EQ(groupSizeZ, commandList.cmdListHelper.groupSize[2]);

    EXPECT_EQ(numEvents / groupSizeX, commandList.cmdListHelper.threadGroupDimensions.groupCountX);
    EXPECT_EQ(1u, commandList.cmdListHelper.threadGroupDimensions.groupCountY);
    EXPECT_EQ(1u, commandList.cmdListHelper.threadGroupDimensions.groupCountZ);

    driverHandle->freeMem(alloc);
}
// Checks that appendQueryKernelTimestamps reports ZE_RESULT_ERROR_UNKNOWN when the
// builtin kernel's suggestGroupSize call fails.
HWTEST2_F(AppendQueryKernelTimestamps, givenCommandListWhenAppendQueryKernelTimestampsAndInvalidResultSuggestGroupSizeThanUnknownResultReturned, TestPlatforms) {
    // Kernel stub whose suggestGroupSize always fails; the other overrides are no-ops.
    class MockQueryKernelTimestampsKernel : public L0::KernelImp {
      public:
        ze_result_t suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY,
                                     uint32_t globalSizeZ, uint32_t *groupSizeX,
                                     uint32_t *groupSizeY, uint32_t *groupSizeZ) override {
            return ZE_RESULT_ERROR_UNKNOWN;
        }
        void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {}
        void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
        std::unique_ptr<Kernel> clone() const override { return nullptr; }
    };

    // Builtin library that returns the stub kernel regardless of the requested builtin.
    struct MockBuiltinFunctionsForQueryKernelTimestamps : BuiltinFunctionsLibImpl {
        MockBuiltinFunctionsForQueryKernelTimestamps(L0::Device *device, NEO::BuiltIns *builtInsLib)
            : BuiltinFunctionsLibImpl(device, builtInsLib),
              tmpMockKernel(std::make_unique<MockQueryKernelTimestampsKernel>()) {}

        MockQueryKernelTimestampsKernel *getFunction(Builtin func) override {
            return tmpMockKernel.get();
        }

        std::unique_ptr<MockQueryKernelTimestampsKernel> tmpMockKernel;
    };

    // Device wrapper that borrows the fixture device's NEO device, execution
    // environment and driver handle, but exposes the mock builtin library.
    class MockDeviceHandle : public L0::DeviceImp {
      public:
        void initialize(L0::Device *device) {
            neoDevice = device->getNEODevice();
            // NOTE(review): presumably balances a reference released by DeviceImp's
            // destructor - confirm against DeviceImp.
            neoDevice->incRefInternal();
            execEnvironment = device->getExecEnvironment();
            driverHandle = device->getDriverHandle();
            tmpMockBultinLib = std::make_unique<MockBuiltinFunctionsForQueryKernelTimestamps>(nullptr, nullptr);
        }

        MockBuiltinFunctionsForQueryKernelTimestamps *getBuiltinFunctionsLib() override {
            return tmpMockBultinLib.get();
        }

        std::unique_ptr<MockBuiltinFunctionsForQueryKernelTimestamps> tmpMockBultinLib;
    };

    MockDeviceHandle mockDevice;
    mockDevice.initialize(device);

    MockCommandListForAppendLaunchKernel<gfxCoreFamily> commandList;
    commandList.initialize(&mockDevice, false);

    MockEvent event;
    event.waitScope = ZE_EVENT_SCOPE_FLAG_HOST;
    event.signalScope = ZE_EVENT_SCOPE_FLAG_HOST;
    ze_event_handle_t events[2] = {event.toHandle(), event.toHandle()};

    void *alloc = nullptr;
    auto result = driverHandle->allocDeviceMem(&mockDevice, ZE_DEVICE_MEM_ALLOC_FLAG_FORCE_UINT32, 128, 1, &alloc);
    // Fatal on failure: the pointer is handed to the command list right after.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    result = commandList.appendQueryKernelTimestamps(2u, events, alloc, nullptr, nullptr, 0u, nullptr);
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, result);

    driverHandle->freeMem(alloc);
}
// Checks that appendQueryKernelTimestamps reports ZE_RESULT_ERROR_UNKNOWN when the
// builtin kernel accepts the group-size suggestion request but setGroupSize fails.
HWTEST2_F(AppendQueryKernelTimestamps, givenCommandListWhenAppendQueryKernelTimestampsAndInvalidResultSetGroupSizeThanUnknownResultReturned, TestPlatforms) {
    // Kernel stub: suggestGroupSize succeeds with a 1x1x1 group, setGroupSize always
    // fails; the other overrides are no-ops.
    class MockQueryKernelTimestampsKernel : public L0::KernelImp {
      public:
        ze_result_t suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY,
                                     uint32_t globalSizeZ, uint32_t *groupSizeX,
                                     uint32_t *groupSizeY, uint32_t *groupSizeZ) override {
            *groupSizeX = 1u;
            *groupSizeY = 1u;
            *groupSizeZ = 1u;
            return ZE_RESULT_SUCCESS;
        }
        ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
                                 uint32_t groupSizeZ) override {
            return ZE_RESULT_ERROR_UNKNOWN;
        }
        void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {}
        void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
        std::unique_ptr<Kernel> clone() const override { return nullptr; }
    };

    // Builtin library that returns the stub kernel regardless of the requested builtin.
    struct MockBuiltinFunctionsForQueryKernelTimestamps : BuiltinFunctionsLibImpl {
        MockBuiltinFunctionsForQueryKernelTimestamps(L0::Device *device, NEO::BuiltIns *builtInsLib)
            : BuiltinFunctionsLibImpl(device, builtInsLib),
              tmpMockKernel(std::make_unique<MockQueryKernelTimestampsKernel>()) {}

        MockQueryKernelTimestampsKernel *getFunction(Builtin func) override {
            return tmpMockKernel.get();
        }

        std::unique_ptr<MockQueryKernelTimestampsKernel> tmpMockKernel;
    };

    // Device wrapper that borrows the fixture device's NEO device, execution
    // environment and driver handle, but exposes the mock builtin library.
    class MockDeviceHandle : public L0::DeviceImp {
      public:
        void initialize(L0::Device *device) {
            neoDevice = device->getNEODevice();
            // NOTE(review): presumably balances a reference released by DeviceImp's
            // destructor - confirm against DeviceImp.
            neoDevice->incRefInternal();
            execEnvironment = device->getExecEnvironment();
            driverHandle = device->getDriverHandle();
            tmpMockBultinLib = std::make_unique<MockBuiltinFunctionsForQueryKernelTimestamps>(nullptr, nullptr);
        }

        MockBuiltinFunctionsForQueryKernelTimestamps *getBuiltinFunctionsLib() override {
            return tmpMockBultinLib.get();
        }

        std::unique_ptr<MockBuiltinFunctionsForQueryKernelTimestamps> tmpMockBultinLib;
    };

    MockDeviceHandle mockDevice;
    mockDevice.initialize(device);

    MockCommandListForAppendLaunchKernel<gfxCoreFamily> commandList;
    commandList.initialize(&mockDevice, false);

    MockEvent event;
    event.waitScope = ZE_EVENT_SCOPE_FLAG_HOST;
    event.signalScope = ZE_EVENT_SCOPE_FLAG_HOST;
    ze_event_handle_t events[2] = {event.toHandle(), event.toHandle()};

    void *alloc = nullptr;
    auto result = driverHandle->allocDeviceMem(&mockDevice, ZE_DEVICE_MEM_ALLOC_FLAG_FORCE_UINT32, 128, 1, &alloc);
    // Fatal on failure: the pointer is handed to the command list right after.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    result = commandList.appendQueryKernelTimestamps(2u, events, alloc, nullptr, nullptr, 0u, nullptr);
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, result);

    driverHandle->freeMem(alloc);
}
HWTEST_F(CommandListCreate, givenCommandListWithCopyOnlyWhenAppendSignalEventThenMiFlushDWIsProgrammed) {
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily, device, true));