From 62428505c560d0b5c727bbafe8322c9eeeab35fc Mon Sep 17 00:00:00 2001 From: Kamil Diedrich Date: Tue, 18 Aug 2020 16:48:29 +0200 Subject: [PATCH] Use suggest groupSize in appendQueryKernelTimestamps Change-Id: Ic4f0a5a47dfbf564b61baa2fbb2c03c0b5db4b14 --- level_zero/core/source/cmdlist/cmdlist_hw.inl | 25 ++- .../sources/cmdlist/test_cmdlist_1.cpp | 204 ++++++++++++++++++ 2 files changed, 222 insertions(+), 7 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 2b1711f073..d5e5188d8d 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1464,7 +1464,7 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents); commandContainer.addToResidencyContainer(dstptrAllocationStruct.alloc); - uint64_t *timestampsAddress = new uint64_t[numEvents]; + std::unique_ptr timestampsAddress = std::make_unique(numEvents); for (uint32_t i = 0u; i < numEvents; ++i) { auto event = Event::fromHandle(phEvents[i]); @@ -1488,7 +1488,7 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( commandContainer.addToResidencyContainer(timestampsGPUAddress); commandContainer.getDeallocationContainer().push_back(timestampsGPUAddress); - bool result = device->getDriverHandle()->getMemoryManager()->copyMemoryToAllocation(timestampsGPUAddress, timestampsAddress, sizeof(uint64_t) * numEvents); + bool result = device->getDriverHandle()->getMemoryManager()->copyMemoryToAllocation(timestampsGPUAddress, timestampsAddress.get(), sizeof(uint64_t) * numEvents); UNRECOVERABLE_IF(!result); @@ -1507,8 +1507,22 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( offsetValPtr += sizeof(size_t); } - ze_group_count_t dispatchFuncArgs{1, 1, 1}; - builtinFunction->setGroupSize(numEvents, 1, 1); + uint32_t groupSizeX = 1u; + 
uint32_t groupSizeY = 1u; + uint32_t groupSizeZ = 1u; + + if (builtinFunction->suggestGroupSize(numEvents, 1u, 1u, + &groupSizeX, &groupSizeY, &groupSizeZ) != ZE_RESULT_SUCCESS) { + DEBUG_BREAK_IF(true); + return ZE_RESULT_ERROR_UNKNOWN; + } + + if (builtinFunction->setGroupSize(groupSizeX, groupSizeY, groupSizeZ) != ZE_RESULT_SUCCESS) { + DEBUG_BREAK_IF(true); + return ZE_RESULT_ERROR_UNKNOWN; + } + + ze_group_count_t dispatchFuncArgs{numEvents / groupSizeX, 1u, 1u}; auto dstValPtr = static_cast(dstptrAllocationStruct.alloc->getGpuAddress()); @@ -1518,7 +1532,6 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( auto appendResult = appendLaunchKernel(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent, numWaitEvents, phWaitEvents); if (appendResult != ZE_RESULT_SUCCESS) { - delete[] timestampsAddress; return appendResult; } @@ -1526,8 +1539,6 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( CommandListCoreFamily::appendSignalEvent(hSignalEvent); } - delete[] timestampsAddress; - return ZE_RESULT_SUCCESS; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index fe36a7a122..c0f659f01f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -12,7 +12,9 @@ #include "opencl/test/unit_test/mocks/mock_graphics_allocation.h" #include "test.h" +#include "level_zero/core/source/builtin/builtin_functions_lib_impl.h" #include "level_zero/core/source/cmdqueue/cmdqueue_imp.h" +#include "level_zero/core/source/kernel/kernel_imp.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" #include "level_zero/core/test/unit_tests/mocks/mock_event.h" @@ -492,6 +494,208 @@ HWTEST2_F(AppendQueryKernelTimestamps, givenCommandListWhenAppendQueryKernelTime 
driverHandle->freeMem(offsetAlloc); } +HWTEST2_F(AppendQueryKernelTimestamps, givenCommandListWhenAppendQueryKernelTimestampsWithEventsNumberBiggerThanMaxWorkItemSizeThenProperGroupSizeAndGroupCountIsSet, TestPlatforms) { /* Appends a timestamp query for more events than maxWorkItemSizes[0] and expects cmdListHelper to hold the group size reported by suggestGroupSize and groupCountX == eventCount / groupSizeX. NOTE(review): the template argument lists in the three tests added by this hunk were absent from the patch text and are restored here by inference from usage - confirm against the upstream commit. */ + MockCommandListForAppendLaunchKernel<gfxCoreFamily> commandList; + commandList.initialize(device, false); + + device->getBuiltinFunctionsLib()->initFunctions(); + MockEvent event; + event.waitScope = ZE_EVENT_SCOPE_FLAG_HOST; + event.signalScope = ZE_EVENT_SCOPE_FLAG_HOST; + + void *alloc = nullptr; /* initialised so a failed allocation cannot leave an indeterminate pointer for the calls below */ + auto result = driverHandle->allocDeviceMem(device, ZE_DEVICE_MEM_ALLOC_FLAG_FORCE_UINT32, 128, 1, &alloc); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + size_t eventCount = device->getNEODevice()->getDeviceInfo().maxWorkItemSizes[0] * 2u; /* twice the X work-item limit, so the event count exceeds it */ + std::unique_ptr<ze_event_handle_t[]> events = std::make_unique<ze_event_handle_t[]>(eventCount); + + for (size_t i = 0u; i < eventCount; ++i) { + events[i] = event.toHandle(); /* every slot refers to the same mock event */ + } + + result = commandList.appendQueryKernelTimestamps(static_cast<uint32_t>(eventCount), events.get(), alloc, nullptr, nullptr, 0u, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_EQ(device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestamps)->getIsaAllocation()->getGpuAddress(), commandList.cmdListHelper.isaAllocation->getGpuAddress()); + + uint32_t groupSizeX = static_cast<uint32_t>(eventCount); /* used as the global size on input and overwritten with the suggested group size */ + uint32_t groupSizeY = 1u; + uint32_t groupSizeZ = 1u; + + EXPECT_EQ(ZE_RESULT_SUCCESS, device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestamps)->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ)); /* the expectations below are only meaningful if this call succeeded */ + + EXPECT_EQ(groupSizeX, commandList.cmdListHelper.groupSize[0]); + EXPECT_EQ(groupSizeY, commandList.cmdListHelper.groupSize[1]); + EXPECT_EQ(groupSizeZ, commandList.cmdListHelper.groupSize[2]); + + EXPECT_EQ(static_cast<uint32_t>(eventCount) / groupSizeX, commandList.cmdListHelper.threadGroupDimensions.groupCountX); + EXPECT_EQ(1u, commandList.cmdListHelper.threadGroupDimensions.groupCountY); + EXPECT_EQ(1u, commandList.cmdListHelper.threadGroupDimensions.groupCountZ); + +
driverHandle->freeMem(alloc); +} + +HWTEST2_F(AppendQueryKernelTimestamps, givenCommandListWhenAppendQueryKernelTimestampsAndInvalidResultSuggestGroupSizeThanUnknownResultReturned, TestPlatforms) { /* Expects appendQueryKernelTimestamps to return ZE_RESULT_ERROR_UNKNOWN when the builtin kernel's suggestGroupSize fails. */ + class MockQueryKernelTimestampsKernel : public L0::KernelImp { /* kernel stub: suggestGroupSize always fails, the remaining overrides do nothing */ + public: + ze_result_t suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY, + uint32_t globalSizeZ, uint32_t *groupSizeX, + uint32_t *groupSizeY, uint32_t *groupSizeZ) override { + return ZE_RESULT_ERROR_UNKNOWN; + } + void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override { + return; + } + void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override { + return; + } + std::unique_ptr<Kernel> clone() const override { return nullptr; } + }; + struct MockBuiltinFunctionsLibImpl : BuiltinFunctionsLibImpl { /* NOTE(review): not referenced anywhere in this test - looks like dead code */ + + using BuiltinFunctionsLibImpl::builtins; + using BuiltinFunctionsLibImpl::getFunction; + using BuiltinFunctionsLibImpl::imageBuiltins; + MockBuiltinFunctionsLibImpl(L0::Device *device, NEO::BuiltIns *builtInsLib) : BuiltinFunctionsLibImpl(device, builtInsLib) {} + }; + struct MockBuiltinFunctionsForQueryKernelTimestamps : BuiltinFunctionsLibImpl { /* builtin library stub: owns one mock kernel and returns it for every Builtin id */ + MockBuiltinFunctionsForQueryKernelTimestamps(L0::Device *device, NEO::BuiltIns *builtInsLib) : BuiltinFunctionsLibImpl(device, builtInsLib) { + tmpMockKernel = new MockQueryKernelTimestampsKernel; + } + MockQueryKernelTimestampsKernel *getFunction(Builtin func) override { + return tmpMockKernel; + } + ~MockBuiltinFunctionsForQueryKernelTimestamps() override { + delete tmpMockKernel; + } + MockQueryKernelTimestampsKernel *tmpMockKernel = nullptr; + }; + class MockDeviceHandle : public L0::DeviceImp { /* device stub: borrows neoDevice, execEnvironment and driverHandle from the fixture device and serves the stub builtin library */ + public: + MockDeviceHandle() { + } + void initialize(L0::Device *device) { + neoDevice = device->getNEODevice(); + neoDevice->incRefInternal(); /* extra internal reference on the borrowed NEO device; presumably released by the DeviceImp destructor - confirm */ + execEnvironment = device->getExecEnvironment(); + driverHandle = device->getDriverHandle(); + tmpMockBultinLib = new
MockBuiltinFunctionsForQueryKernelTimestamps{nullptr, nullptr}; + } + MockBuiltinFunctionsForQueryKernelTimestamps *getBuiltinFunctionsLib() override { + return tmpMockBultinLib; + } + ~MockDeviceHandle() override { + delete tmpMockBultinLib; + } + MockBuiltinFunctionsForQueryKernelTimestamps *tmpMockBultinLib = nullptr; + }; + + MockDeviceHandle mockDevice; + mockDevice.initialize(device); + + MockCommandListForAppendLaunchKernel<gfxCoreFamily> commandList; + + commandList.initialize(&mockDevice, false); + + MockEvent event; + ze_event_handle_t events[2] = {event.toHandle(), event.toHandle()}; + event.waitScope = ZE_EVENT_SCOPE_FLAG_HOST; + event.signalScope = ZE_EVENT_SCOPE_FLAG_HOST; + + void *alloc = nullptr; /* initialised so a failed allocation cannot leave an indeterminate pointer for the calls below */ + auto result = driverHandle->allocDeviceMem(&mockDevice, ZE_DEVICE_MEM_ALLOC_FLAG_FORCE_UINT32, 128, 1, &alloc); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + + result = commandList.appendQueryKernelTimestamps(2u, events, alloc, nullptr, nullptr, 0u, nullptr); + EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, result); + + driverHandle->freeMem(alloc); +} + +HWTEST2_F(AppendQueryKernelTimestamps, givenCommandListWhenAppendQueryKernelTimestampsAndInvalidResultSetGroupSizeThanUnknownResultReturned, TestPlatforms) { /* Expects appendQueryKernelTimestamps to return ZE_RESULT_ERROR_UNKNOWN when suggestGroupSize succeeds but the builtin kernel's setGroupSize fails. */ + class MockQueryKernelTimestampsKernel : public L0::KernelImp { /* kernel stub: suggestGroupSize reports 1x1x1 and succeeds, setGroupSize always fails, the remaining overrides do nothing */ + public: + ze_result_t suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY, + uint32_t globalSizeZ, uint32_t *groupSizeX, + uint32_t *groupSizeY, uint32_t *groupSizeZ) override { + *groupSizeX = static_cast<uint32_t>(1u); + *groupSizeY = static_cast<uint32_t>(1u); + *groupSizeZ = static_cast<uint32_t>(1u); + return ZE_RESULT_SUCCESS; + } + ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, + uint32_t groupSizeZ) override { + return ZE_RESULT_ERROR_UNKNOWN; + } + void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override { + return; + } + void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override { + return; + } + std::unique_ptr<Kernel> clone()
const override { return nullptr; } + }; + struct MockBuiltinFunctionsLibImpl : BuiltinFunctionsLibImpl { /* NOTE(review): not referenced anywhere in this test - looks like dead code */ + + using BuiltinFunctionsLibImpl::builtins; + using BuiltinFunctionsLibImpl::getFunction; + using BuiltinFunctionsLibImpl::imageBuiltins; + MockBuiltinFunctionsLibImpl(L0::Device *device, NEO::BuiltIns *builtInsLib) : BuiltinFunctionsLibImpl(device, builtInsLib) {} + }; + struct MockBuiltinFunctionsForQueryKernelTimestamps : BuiltinFunctionsLibImpl { /* builtin library stub: owns one mock kernel and returns it for every Builtin id */ + MockBuiltinFunctionsForQueryKernelTimestamps(L0::Device *device, NEO::BuiltIns *builtInsLib) : BuiltinFunctionsLibImpl(device, builtInsLib) { + tmpMockKernel = new MockQueryKernelTimestampsKernel; + } + MockQueryKernelTimestampsKernel *getFunction(Builtin func) override { + return tmpMockKernel; + } + ~MockBuiltinFunctionsForQueryKernelTimestamps() override { + delete tmpMockKernel; + } + MockQueryKernelTimestampsKernel *tmpMockKernel = nullptr; + }; + class MockDeviceHandle : public L0::DeviceImp { /* device stub: borrows neoDevice, execEnvironment and driverHandle from the fixture device and serves the stub builtin library */ + public: + MockDeviceHandle() { + } + void initialize(L0::Device *device) { + neoDevice = device->getNEODevice(); + neoDevice->incRefInternal(); /* extra internal reference on the borrowed NEO device; presumably released by the DeviceImp destructor - confirm */ + execEnvironment = device->getExecEnvironment(); + driverHandle = device->getDriverHandle(); + tmpMockBultinLib = new MockBuiltinFunctionsForQueryKernelTimestamps{nullptr, nullptr}; + } + MockBuiltinFunctionsForQueryKernelTimestamps *getBuiltinFunctionsLib() override { + return tmpMockBultinLib; + } + ~MockDeviceHandle() override { + delete tmpMockBultinLib; + } + MockBuiltinFunctionsForQueryKernelTimestamps *tmpMockBultinLib = nullptr; + }; + + MockDeviceHandle mockDevice; + mockDevice.initialize(device); + + MockCommandListForAppendLaunchKernel<gfxCoreFamily> commandList; + + commandList.initialize(&mockDevice, false); + + MockEvent event; + ze_event_handle_t events[2] = {event.toHandle(), event.toHandle()}; + event.waitScope = ZE_EVENT_SCOPE_FLAG_HOST; + event.signalScope = ZE_EVENT_SCOPE_FLAG_HOST; + + void *alloc = nullptr; /* initialised so a failed allocation cannot leave an indeterminate pointer for the calls below */ + auto result = driverHandle->allocDeviceMem(&mockDevice,
ZE_DEVICE_MEM_ALLOC_FLAG_FORCE_UINT32, 128, 1, &alloc); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + + result = commandList.appendQueryKernelTimestamps(2u, events, alloc, nullptr, nullptr, 0u, nullptr); + EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, result); + + driverHandle->freeMem(alloc); +} + HWTEST_F(CommandListCreate, givenCommandListWithCopyOnlyWhenAppendSignalEventThenMiFlushDWIsProgrammed) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; std::unique_ptr commandList(CommandList::create(productFamily, device, true));