diff --git a/opencl/test/unit_test/command_stream/cl_command_stream_receiver_tests.cpp b/opencl/test/unit_test/command_stream/cl_command_stream_receiver_tests.cpp index ae19ec0745..db6ac0e8bf 100644 --- a/opencl/test/unit_test/command_stream/cl_command_stream_receiver_tests.cpp +++ b/opencl/test/unit_test/command_stream/cl_command_stream_receiver_tests.cpp @@ -48,7 +48,7 @@ TEST(ClCommandStreamReceiverTest, WhenMakingResidentThenBufferResidencyFlagIsSet using ClCommandStreamReceiverTests = Test; HWTEST_F(ClCommandStreamReceiverTests, givenCommandStreamReceiverWhenFenceAllocationIsRequiredAndCreateGlobalFenceAllocationIsCalledThenFenceAllocationIsAllocated) { - RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ + RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ *pDevice->executionEnvironment->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]}; MockCsrHw csr(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); @@ -62,7 +62,7 @@ HWTEST_F(ClCommandStreamReceiverTests, givenCommandStreamReceiverWhenFenceAlloca } HWTEST_F(ClCommandStreamReceiverTests, givenCommandStreamReceiverWhenGettingFenceAllocationThenCorrectFenceAllocationIsReturned) { - RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ + RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ *pDevice->executionEnvironment->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]}; CommandStreamReceiverHw csr(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp index b36dd149c4..98e65bc56f 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp @@ -539,7 +539,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, WhenFlushingThenScratchAllocationI } HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCommandStreamReceiverWhenFenceAllocationIsRequiredAndFlushTaskIsCalledThenFenceAllocationIsMadeResident) { - RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ + RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ *pDevice->executionEnvironment->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]}; auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); @@ -558,7 +558,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCommandStreamReceiverWhenFenc } HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCommandStreamReceiverWhenFenceAllocationIsRequiredAndCreatedThenItIsMadeResidentDuringFlushSmallTask) { - RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ + RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ *pDevice->executionEnvironment->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]}; MockCsrHw csr(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); @@ -580,7 +580,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCommandStreamReceiverWhenFenc } HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCommandStreamReceiverWhenFenceAllocationIsRequiredButNotCreatedThenItIsNotMadeResidentDuringFlushSmallTask) { - RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ + RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ *pDevice->executionEnvironment->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]}; MockCsrHw csr(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp index 40b53014e6..496cd0f96a 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp @@ -809,7 +809,7 @@ HWTEST_F(BcsTests, givenInputAllocationsWhenBlitDispatchedThenMakeAllAllocations } HWTEST_F(BcsTests, givenFenceAllocationIsRequiredWhenBlitDispatchedThenMakeAllAllocationsResident) { - RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ + RAIIGfxCoreHelperFactory> gfxCoreHelperBackup{ *pDevice->getExecutionEnvironment()->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]}; auto bcsOsContext = std::unique_ptr(OsContext::create(nullptr, pDevice->getRootDeviceIndex(), 0, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular}, pDevice->getDeviceBitfield()))); diff --git a/opencl/test/unit_test/mem_obj/buffer_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_tests.cpp index 79c094afd3..0429519118 100644 --- a/opencl/test/unit_test/mem_obj/buffer_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_tests.cpp @@ -1916,7 +1916,7 @@ HWTEST_F(BufferCreateTests, givenClMemCopyHostPointerPassedToBufferCreateWhenAll constexpr size_t bigBufferSize = smallBufferSize + 1; char memory[smallBufferSize]; char bigMemory[bigBufferSize]; - RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{ + RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{ *executionEnvironment->rootDeviceEnvironments[0]}; { @@ -2023,7 +2023,7 @@ HWTEST_F(BufferCreateTests, givenClMemCopyHostPointerPassedToBufferCreateWhenAll auto writeBufferCounter = commandQueue->writeBufferCounter; size_t lockResourceCalled = memoryManager->lockResourceCalled; - static_cast *>(executionEnvironment->rootDeviceEnvironments[0]->gfxCoreHelper.get())->setIsLockable = false; + static_cast *>(executionEnvironment->rootDeviceEnvironments[0]->gfxCoreHelper.get())->setIsLockable = false; std::unique_ptr buffer(Buffer::create(&context, flags, sizeof(memory), memory, retVal)); ASSERT_NE(nullptr, buffer.get()); diff --git a/opencl/test/unit_test/mem_obj/linux/buffer_linux_tests.cpp b/opencl/test/unit_test/mem_obj/linux/buffer_linux_tests.cpp index 9b66b6fb0b..b84946571c 100644 --- a/opencl/test/unit_test/mem_obj/linux/buffer_linux_tests.cpp +++ b/opencl/test/unit_test/mem_obj/linux/buffer_linux_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -161,7 +161,7 @@ HWTEST_F(BufferCreateLinuxTests, givenClMemCopyHostPointerPassedToBufferCreateWh context.setSpecialQueue(commandQueue, mockRootDeviceIndex); constexpr size_t smallBufferSize = Buffer::maxBufferSizeForCopyOnCpu; char memory[smallBufferSize]; - RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{*executionEnvironment->rootDeviceEnvironments[0]}; + RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{*executionEnvironment->rootDeviceEnvironments[0]}; { // cpu copy allowed @@ -202,7 +202,7 @@ HWTEST_F(BufferCreateLinuxTests, givenClMemCopyHostPointerPassedToBufferCreateWh context.setSpecialQueue(commandQueue, mockRootDeviceIndex); constexpr size_t bigBufferSize = Buffer::maxBufferSizeForCopyOnCpu + 1; char bigMemory[bigBufferSize]; - RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{*executionEnvironment->rootDeviceEnvironments[0]}; + RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{*executionEnvironment->rootDeviceEnvironments[0]}; { // buffer size over threshold -> cpu copy disallowed diff --git a/opencl/test/unit_test/mem_obj/windows/buffer_windows_tests.cpp b/opencl/test/unit_test/mem_obj/windows/buffer_windows_tests.cpp index d32e4cc551..ad2d91b206 100644 --- a/opencl/test/unit_test/mem_obj/windows/buffer_windows_tests.cpp +++ b/opencl/test/unit_test/mem_obj/windows/buffer_windows_tests.cpp @@ -162,7 +162,7 @@ HWTEST_F(BufferCreateWindowsTests, givenClMemCopyHostPointerPassedToBufferCreate context.setSpecialQueue(commandQueue, mockRootDeviceIndex); constexpr size_t smallBufferSize = Buffer::maxBufferSizeForCopyOnCpu; char memory[smallBufferSize]; - RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{ + RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{ *executionEnvironment->rootDeviceEnvironments[0]}; { @@ -204,7 +204,7 @@ HWTEST_F(BufferCreateWindowsTests, givenClMemCopyHostPointerPassedToBufferCreate context.setSpecialQueue(commandQueue, mockRootDeviceIndex); constexpr size_t bigBufferSize = Buffer::maxBufferSizeForCopyOnCpu + 1; char bigMemory[bigBufferSize]; - RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{*executionEnvironment->rootDeviceEnvironments[0]}; + RAIIGfxCoreHelperFactory> overrideGfxCoreHelperHw{*executionEnvironment->rootDeviceEnvironments[0]}; { // buffer size over threshold -> cpu copy disallowed diff --git a/shared/source/helpers/kernel_helpers.cpp b/shared/source/helpers/kernel_helpers.cpp index 81a07427d4..9bf7beea19 100644 --- a/shared/source/helpers/kernel_helpers.cpp +++ b/shared/source/helpers/kernel_helpers.cpp @@ -54,16 +54,16 @@ uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDev UNRECOVERABLE_IF(workGroupSize == 0); auto numThreadsPerThreadGroup = static_cast(Math::divideAndRoundUp(workGroupSize, simdSize)); auto maxWorkGroupsCount = availableThreadCount / numThreadsPerThreadGroup; - - if (barrierCount > 0) { - auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / barrierCount); + if (barrierCount > 0 || usedSlmSize > 0) { helper.alignThreadGroupCountToDssSize(maxWorkGroupsCount, dssCount, availableThreadCount / dssCount, numThreadsPerThreadGroup); - maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToBarrierUsage); - } - - if (usedSlmSize > 0) { - auto maxWorkGroupsCountDueToSlm = availableSlmSize / usedSlmSize; - maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm); + if (barrierCount > 0) { + auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / barrierCount); + maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToBarrierUsage); + } + if (usedSlmSize > 0) { + auto maxWorkGroupsCountDueToSlm = availableSlmSize / usedSlmSize; + maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm); + } } maxWorkGroupsCount = helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment); diff --git a/shared/test/common/mocks/mock_gfx_core_helper.h b/shared/test/common/mocks/mock_gfx_core_helper.h index fa6312b684..34b3c7fdcd 100644 --- a/shared/test/common/mocks/mock_gfx_core_helper.h +++ b/shared/test/common/mocks/mock_gfx_core_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,26 +13,18 @@ namespace NEO { template -class MockGfxCoreHelperWithFenceAllocation : public GfxCoreHelperHw { +class MockGfxCoreHelperHw : public GfxCoreHelperHw { public: bool isFenceAllocationRequired(const HardwareInfo &hwInfo) const override { return true; } -}; - -template -class MockGfxCoreHelperWithLocalMemory : public GfxCoreHelperHw { - public: - bool isLocalMemoryEnabled(const HardwareInfo &hwInfo) const override { - return true; - } -}; - -template -struct MockGfxCoreHelperHwWithSetIsLockable : public GfxCoreHelperHw { void setExtraAllocationData(AllocationData &allocationData, const AllocationProperties &properties, const RootDeviceEnvironment &rootDeviceEnvironment) const override { allocationData.storageInfo.isLockable = setIsLockable; } + void alignThreadGroupCountToDssSize(uint32_t &threadCount, uint32_t dssCount, uint32_t threadsPerDss, uint32_t threadGroupSize) const override { + alignThreadGroupCountToDssSizeCalledTimes++; + } + mutable uint32_t alignThreadGroupCountToDssSizeCalledTimes = 0; bool setIsLockable = true; }; } // namespace NEO diff --git a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp index cdbd0cdf1a..6a60ee7db5 100644 --- a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp +++ b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp @@ -12,9 +12,11 @@ #include "shared/test/common/fixtures/device_fixture.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/mock_product_helper_hw.h" +#include "shared/test/common/helpers/raii_gfx_core_helper.h" #include "shared/test/common/helpers/raii_product_helper.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/mocks/mock_execution_environment.h" +#include "shared/test/common/mocks/mock_gfx_core_helper.h" #include "shared/test/common/test_macros/hw_test.h" #include "shared/test/common/test_macros/test.h" @@ -107,16 +109,42 @@ HWTEST2_F(KernelHelperMaxWorkGroupsTests, GivenBarriersWhenCalculatingMaxWorkGro EXPECT_EQ(expected, getMaxWorkGroupCount()); } -TEST_F(KernelHelperMaxWorkGroupsTests, GivenUsedSlmSizeWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToUsedSlmSize) { +HWTEST2_F(KernelHelperMaxWorkGroupsTests, GivenUsedSlmSizeWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToUsedSlmSize, MatchAny) { + NEO::RAIIProductHelperFactory> raii(*rootDeviceEnvironment); + raii.mockProductHelper->isCooperativeEngineSupportedValue = false; usedSlm = 0; - auto baseCount = getMaxWorkGroupCount(); + lws[0] = 1; + lws[1] = 0; + lws[2] = 0; + workDim = 1; usedSlm = 4 * MemoryConstants::kiloByte; - auto expected = std::min(baseCount, availableSlm / usedSlm); + auto expected = availableSlm / usedSlm; EXPECT_EQ(expected, getMaxWorkGroupCount()); } +HWTEST_F(KernelHelperMaxWorkGroupsTests, givenUsedSlmSizeWhenCalculatingMaxWorkGroupsCountThenAlignToDssSizeCalled) { + auto raiiFactory = RAIIGfxCoreHelperFactory>(*rootDeviceEnvironment); + usedSlm = 4 * MemoryConstants::kiloByte; + getMaxWorkGroupCount(); + EXPECT_EQ(raiiFactory.mockGfxCoreHelper->alignThreadGroupCountToDssSizeCalledTimes, 1u); +} +HWTEST_F(KernelHelperMaxWorkGroupsTests, givenBarriersWhenCalculatingMaxWorkGroupsCountThenAlignToDssSizeCalled) { + auto raiiFactory = RAIIGfxCoreHelperFactory>(*rootDeviceEnvironment); + numberOfBarriers = 1; + getMaxWorkGroupCount(); + EXPECT_EQ(raiiFactory.mockGfxCoreHelper->alignThreadGroupCountToDssSizeCalledTimes, 1u); +} + +HWTEST_F(KernelHelperMaxWorkGroupsTests, givenZeroBarriersAndSlmNotUsedWhenCalculatingMaxWorkGroupsCountThenAlignToDssSizeNotCalled) { + auto raiiFactory = RAIIGfxCoreHelperFactory>(*rootDeviceEnvironment); + numberOfBarriers = 0; + usedSlm = 0; + getMaxWorkGroupCount(); + EXPECT_EQ(raiiFactory.mockGfxCoreHelper->alignThreadGroupCountToDssSizeCalledTimes, 0u); +} + TEST_F(KernelHelperMaxWorkGroupsTests, GivenVariousValuesWhenCalculatingMaxWorkGroupsCountThenLowestResultIsAlwaysReturned) { auto &helper = rootDeviceEnvironment->getHelper();