From dd1d52259e60851299883a43e2129d756f25cf6e Mon Sep 17 00:00:00 2001 From: Katarzyna Cencelewska Date: Thu, 21 Mar 2024 18:54:41 +0000 Subject: [PATCH] refactor: add param rootDeviceEnvironment to calculateNumThreadsPerThreadGroup Signed-off-by: Katarzyna Cencelewska --- level_zero/core/source/kernel/kernel_imp.cpp | 10 ++-- .../test_cmdlist_append_launch_kernel_2.cpp | 17 +++--- .../test_cmdlist_append_launch_kernel_3.cpp | 4 +- .../unit_tests/sources/kernel/test_kernel.cpp | 15 +++--- .../enable_hardware_commands_helper.inl | 3 +- .../source/helpers/hardware_commands_helper.h | 8 +-- .../helpers/hardware_commands_helper_base.inl | 14 ++--- ...hardware_commands_helper_bdw_and_later.inl | 11 ++-- ...ardware_commands_helper_xehp_and_later.inl | 8 +-- opencl/source/kernel/kernel.cpp | 7 ++- .../command_queue/dispatch_walker_tests.cpp | 17 +++--- .../get_size_required_buffer_tests.cpp | 6 +-- .../get_size_required_image_tests.cpp | 14 ++--- .../hardware_commands_helper_tests.cpp | 54 +++++++++++-------- .../command_encoder_bdw_and_later.inl | 4 +- .../command_encoder_xehp_and_later.inl | 8 +-- .../source/helpers/aarch64/local_id_gen.cpp | 8 +-- shared/source/helpers/gfx_core_helper.h | 8 +-- .../source/helpers/gfx_core_helper_base.inl | 4 +- shared/source/helpers/local_id_gen.h | 6 +-- shared/source/helpers/per_thread_data.h | 8 +-- shared/source/helpers/x86_64/local_id_gen.cpp | 8 +-- shared/source/kernel/implicit_args_helper.cpp | 13 ++--- shared/source/kernel/implicit_args_helper.h | 8 +-- shared/source/kernel/local_ids_cache.cpp | 18 ++++--- shared/source/kernel/local_ids_cache.h | 10 ++-- .../helpers/gfx_core_helper_tests.cpp | 13 +++-- .../test/unit_test/helpers/local_id_tests.cpp | 45 +++++++++------- .../kernel/implicit_args_helper_tests.cpp | 34 +++++++----- .../kernel/local_ids_cache_tests.cpp | 28 ++++++---- 30 files changed, 231 insertions(+), 180 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 24db3f8fc1..7541cf9270 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -381,13 +381,13 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment(); auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup( - simdSize, static_cast(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime); + simdSize, static_cast(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment); if (kernelRequiresGenerationOfLocalIdsByRuntime) { auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize; uint32_t perThreadDataSizeForWholeThreadGroupNeeded = static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( - simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper)); + simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment)); if (perThreadDataSizeForWholeThreadGroupNeeded > perThreadDataSizeForWholeThreadGroupAllocated) { alignedFree(perThreadDataForWholeThreadGroup); @@ -405,7 +405,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, static_cast(groupSizeY), static_cast(groupSizeZ)}}, std::array{{0, 1, 2}}, - false, grfSize, gfxCoreHelper); + false, grfSize, rootDeviceEnvironment); } this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup; @@ -902,8 +902,8 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties) memset(pKernelProperties->uuid.mid, 0, ZE_MAX_MODULE_UUID_SIZE); uint32_t maxKernelWorkGroupSize = static_cast(this->module->getMaxGroupSize(kernelDescriptor)); - - maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, !kernelRequiresGenerationOfLocalIdsByRuntime, maxKernelWorkGroupSize); + const auto &rootDeviceEnvironment = this->module->getDevice()->getNEODevice()->getRootDeviceEnvironment(); + maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, !kernelRequiresGenerationOfLocalIdsByRuntime, maxKernelWorkGroupSize, rootDeviceEnvironment); pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize; void *pNext = pKernelProperties->pNext; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index bc21c333a3..dca3975ff6 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -996,8 +996,9 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); EXPECT_EQ(ZE_RESULT_SUCCESS, result); - const auto &gfxCoreHelper = device->getGfxCoreHelper(); - implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper); + const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment(); + const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper(); + implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment); auto sizeCrossThreadData = kernel->getCrossThreadDataSize(); auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup(); EXPECT_EQ(indirectHeap->getUsed(), alignUp(sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize, gfxCoreHelper.getIOHAlignment())); @@ -1029,11 +1030,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth); auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize); - const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper(); - generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper); + const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment(); + generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment); auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize(); - size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper); + size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); @@ -1075,11 +1076,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth); auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize); - const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper(); - generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper); + const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment(); + generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment); auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize(); - size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper); + size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 42ce2d963a..75bc316f94 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -732,8 +732,8 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe template uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) { if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) { - const auto &gfxCoreHelper = device->getGfxCoreHelper(); - auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper); + const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment(); + auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment); return implicitArgsProgrammingSize - ImplicitArgs::getSize(); } else { return 0u; diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 2ee26c7141..4572cfeec1 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -304,7 +304,9 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = true; // although it is enabled for SIMD 1, make sure it is enforced mockKernel.descriptor.kernelAttributes.numLocalIdChannels = 3; mockKernel.module = &mockModule; - auto grfSize = mockModule.getDevice()->getHwInfo().capabilityTable.grfSize; + const auto &device = mockModule.getDevice(); + auto grfSize = device->getHwInfo().capabilityTable.grfSize; + const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment(); uint32_t groupSize[3] = {2, 3, 5}; auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]); EXPECT_EQ(ZE_RESULT_SUCCESS, ret); @@ -314,7 +316,8 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett mockKernel.descriptor.kernelAttributes.simdSize, groupSize[0] * groupSize[1] * groupSize[2], grfSize, - mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime); + mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime, + rootDeviceEnvironment); auto perThreadDataSizeForWholeTGNeeded = static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( mockKernel.descriptor.kernelAttributes.simdSize, @@ -322,7 +325,7 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett mockKernel.descriptor.kernelAttributes.numLocalIdChannels, groupSize[0] * groupSize[1] * groupSize[2], !mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime, - gfxHelper)); + rootDeviceEnvironment)); EXPECT_EQ(numThreadsPerTG, mockKernel.getNumThreadsPerThreadGroup()); EXPECT_EQ((perThreadDataSizeForWholeTGNeeded / numThreadsPerTG), mockKernel.perThreadDataSize); @@ -1692,9 +1695,9 @@ TEST_F(KernelPropertiesTests, whenPassingKernelMaxGroupSizePropertiesStructToGet ze_result_t res = kernel->getProperties(&kernelProperties); EXPECT_EQ(ZE_RESULT_SUCCESS, res); - - auto &gfxCoreHelper = module->getDevice()->getGfxCoreHelper(); - uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, false, static_cast(this->module->getMaxGroupSize(kernelDescriptor))); + auto &device = *module->getDevice(); + auto &gfxCoreHelper = device.getGfxCoreHelper(); + uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, false, static_cast(this->module->getMaxGroupSize(kernelDescriptor)), device.getNEODevice()->getRootDeviceEnvironment()); EXPECT_EQ(maxKernelWorkGroupSize, maxGroupSizeProperties.maxGroupSize); } diff --git a/opencl/source/helpers/enable_hardware_commands_helper.inl b/opencl/source/helpers/enable_hardware_commands_helper.inl index 9c739592df..1de60853b4 100644 --- a/opencl/source/helpers/enable_hardware_commands_helper.inl +++ b/opencl/source/helpers/enable_hardware_commands_helper.inl @@ -33,7 +33,8 @@ template size_t NEO::HardwareCommandsHelper::sendCrossThreadDat bool inlineDataProgrammingRequired, FamilyType::DefaultWalkerType *walkerCmd, uint32_t &sizeCrossThreadData, - uint64_t scratchAddress); + uint64_t scratchAddress, + const RootDeviceEnvironment &rootDeviceEnvironment); template size_t NEO::HardwareCommandsHelper::sendInterfaceDescriptorData( const IndirectHeap &indirectHeap, diff --git a/opencl/source/helpers/hardware_commands_helper.h b/opencl/source/helpers/hardware_commands_helper.h index 214e103027..454e4eaadd 100644 --- a/opencl/source/helpers/hardware_commands_helper.h +++ b/opencl/source/helpers/hardware_commands_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -72,7 +72,8 @@ struct HardwareCommandsHelper : public PerThreadDataHelper { bool inlineDataProgrammingRequired, WalkerType *walkerCmd, uint32_t &sizeCrossThreadData, - uint64_t scratchAddress); + uint64_t scratchAddress, + const RootDeviceEnvironment &rootDeviceEnvironment); template static size_t sendIndirectState( @@ -111,7 +112,8 @@ struct HardwareCommandsHelper : public PerThreadDataHelper { const Kernel &kernel); static size_t getSizeRequiredIOH( const Kernel &kernel, - const size_t localWorkSizes[3]); + const size_t localWorkSizes[3], + const RootDeviceEnvironment &rootDeviceEnvironment); static size_t getSizeRequiredSSH( const Kernel &kernel); diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 654e3670c6..06815af138 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -48,12 +48,11 @@ size_t HardwareCommandsHelper::getSizeRequiredDSH(const Kernel &kerne template size_t HardwareCommandsHelper::getSizeRequiredIOH(const Kernel &kernel, - const size_t localWorkSizes[3]) { + const size_t localWorkSizes[3], const RootDeviceEnvironment &rootDeviceEnvironment) { auto localWorkSize = Math::computeTotalElementsCount(localWorkSizes); typedef typename GfxFamily::DefaultWalkerType DefaultWalkerType; const auto &kernelDescriptor = kernel.getDescriptor(); const auto &hwInfo = kernel.getHardwareInfo(); - const auto &gfxCoreHelper = kernel.getGfxCoreHelper(); auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels; uint32_t grfSize = hwInfo.capabilityTable.grfSize; @@ -70,11 +69,11 @@ size_t HardwareCommandsHelper::getSizeRequiredIOH(const Kernel &kerne requiredWalkOrder, simdSize); auto size = kernel.getCrossThreadDataSize() + - getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize, isHwLocalIdGeneration, gfxCoreHelper); + getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize, isHwLocalIdGeneration, rootDeviceEnvironment); auto pImplicitArgs = kernel.getImplicitArgs(); if (pImplicitArgs) { - size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper); + size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment); } return alignUp(size, DefaultWalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); } @@ -110,7 +109,8 @@ size_t HardwareCommandsHelper::getTotalSizeRequiredIOH( const MultiDispatchInfo &multiDispatchInfo) { return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH( *dispatchInfo.getKernel(), - dispatchInfo.getLocalWorkgroupSize().values); }); + dispatchInfo.getLocalWorkgroupSize().values, + dispatchInfo.getClDevice().getRootDeviceEnvironment()); }); } template @@ -270,14 +270,14 @@ size_t HardwareCommandsHelper::sendIndirectState( auto &gfxCoreHelper = device.getGfxCoreHelper(); auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired; auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2]; - auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkItems), grfSize, !localIdsGenerationByRuntime); + auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkItems), grfSize, !localIdsGenerationByRuntime, device.getRootDeviceEnvironment()); uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize(); auto inlineDataProgrammingRequired = EncodeDispatchKernel::inlineDataProgrammingRequired(kernel.getKernelInfo().kernelDescriptor); size_t offsetCrossThreadData = HardwareCommandsHelper::sendCrossThreadData( ioh, kernel, inlineDataProgrammingRequired, - walkerCmd, sizeCrossThreadData, scratchAddress); + walkerCmd, sizeCrossThreadData, scratchAddress, device.getRootDeviceEnvironment()); size_t sizePerThreadDataTotal = 0; size_t sizePerThreadData = 0; diff --git a/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl b/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl index 9f2c5cf408..a959a304b4 100644 --- a/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl +++ b/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -78,20 +78,21 @@ size_t HardwareCommandsHelper::sendCrossThreadData( bool inlineDataProgrammingRequired, WalkerType *walkerCmd, uint32_t &sizeCrossThreadData, - uint64_t scratchAddress) { + uint64_t scratchAddress, + const RootDeviceEnvironment &rootDeviceEnvironment) { indirectHeap.align(WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); auto pImplicitArgs = kernel.getImplicitArgs(); if (pImplicitArgs) { const auto &kernelDescriptor = kernel.getDescriptor(); - const auto &gfxCoreHelper = kernel.getGfxCoreHelper(); + auto isHwLocalIdGeneration = false; - auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper); + auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment); auto implicitArgsGpuVA = indirectHeap.getGraphicsAllocation()->getGpuAddress() + indirectHeap.getUsed(); auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming); - ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper); + ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}, rootDeviceEnvironment); auto implicitArgsCrossThreadPtr = ptrOffset(reinterpret_cast(kernel.getCrossThreadData()), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); *implicitArgsCrossThreadPtr = implicitArgsGpuVA; diff --git a/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl b/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl index 7d3ac6f65a..15c9f1bee5 100644 --- a/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl +++ b/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl @@ -59,7 +59,8 @@ size_t HardwareCommandsHelper::sendCrossThreadData( bool inlineDataProgrammingRequired, WalkerType *walkerCmd, uint32_t &sizeCrossThreadData, - [[maybe_unused]] uint64_t scratchAddress) { + [[maybe_unused]] uint64_t scratchAddress, + const RootDeviceEnvironment &rootDeviceEnvironment) { indirectHeap.align(GfxFamily::indirectDataAlignment); @@ -87,15 +88,14 @@ size_t HardwareCommandsHelper::sendCrossThreadData( requiredWalkOrder, kernelDescriptor.kernelAttributes.simdSize); - const auto &gfxCoreHelper = kernel.getGfxCoreHelper(); - auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !generationOfLocalIdsByRuntime, gfxCoreHelper); + auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !generationOfLocalIdsByRuntime, rootDeviceEnvironment); auto sizeForLocalIdsProgramming = sizeForImplicitArgsProgramming - ImplicitArgs::getSize(); offsetCrossThreadData += sizeForLocalIdsProgramming; auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming); - ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), gfxCoreHelper); + ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), rootDeviceEnvironment); } uint32_t sizeToCopy = sizeCrossThreadData; diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 3a57d6877c..3de6570fcf 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -2201,7 +2201,7 @@ void Kernel::reconfigureKernel() { bool isLocalIdsGeneratedByHw = false; // if local ids generated by runtime then more work groups available maxWorkGroupSize = static_cast(kernelInfo.getMaxRequiredWorkGroupSize(maxWorkGroupSize)); - this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, isLocalIdsGeneratedByHw, maxWorkGroupSize); + this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, isLocalIdsGeneratedByHw, maxWorkGroupSize, getDevice().getRootDeviceEnvironment()); this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites; this->systolicPipelineSelectMode = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode; @@ -2287,13 +2287,12 @@ void Kernel::initializeLocalIdsCache() { void Kernel::setLocalIdsForGroup(const Vec3 &groupSize, void *destination) const { UNRECOVERABLE_IF(localIdsCache.get() == nullptr); - const auto &gfxCoreHelper = this->getGfxCoreHelper(); - localIdsCache->setLocalIdsForGroup(groupSize, destination, gfxCoreHelper); + localIdsCache->setLocalIdsForGroup(groupSize, destination, clDevice.getRootDeviceEnvironment()); } size_t Kernel::getLocalIdsSizeForGroup(const Vec3 &groupSize) const { UNRECOVERABLE_IF(localIdsCache.get() == nullptr); - return localIdsCache->getLocalIdsSizeForGroup(groupSize, getGfxCoreHelper()); + return localIdsCache->getLocalIdsSizeForGroup(groupSize, clDevice.getRootDeviceEnvironment()); } size_t Kernel::getLocalIdsSizePerThread() const { diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index f15e77666b..e5fec316cb 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -742,7 +742,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH walkerArgs); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(kernel); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(kernel, workGroupSize); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(kernel, workGroupSize, pClDevice->getRootDeviceEnvironment()); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(kernel); EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace()); @@ -1354,8 +1354,9 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp multiDispatchInfoWithoutImplicitArgs, CsrDependencies(), walkerArgsWithoutImplicitArgs); + const auto &rootDeviceEnvironment = pClDevice->getRootDeviceEnvironment(); - auto iohSizeWithoutImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithoutImplicitArgs, workGroupSize); + auto iohSizeWithoutImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithoutImplicitArgs, workGroupSize, rootDeviceEnvironment); DispatchInfo dispatchInfoWithImplicitArgs(pClDevice, const_cast(&kernelWithImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets); dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1}); @@ -1370,7 +1371,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp CsrDependencies(), walkerArgsWithImplicitArgs); - auto iohSizeWithImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize); + auto iohSizeWithImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize, rootDeviceEnvironment); EXPECT_LE(iohSizeWithoutImplicitArgs, iohSizeWithImplicitArgs); @@ -1378,10 +1379,10 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; auto simdSize = kernelInfo.getMaxSimdSize(); uint32_t grfSize = sizeof(typename FamilyType::GRF); - const auto &gfxCoreHelper = getHelper(); + auto size = kernelWithImplicitArgs.getCrossThreadDataSize() + - HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(workGroupSize), false, gfxCoreHelper) + - ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), false, gfxCoreHelper); + HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(workGroupSize), false, rootDeviceEnvironment) + + ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), false, rootDeviceEnvironment); size = alignUp(size, MemoryConstants::cacheLineSize); EXPECT_EQ(size, iohSizeWithImplicitArgs); @@ -1404,14 +1405,14 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsAndLocalWorkSizeIsSet dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1}); dispatchInfoWithImplicitArgs.setTotalNumberOfWorkgroups({1, 1, 1}); - auto iohSizeWithImplicitArgsWithoutLWS = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize); + auto iohSizeWithImplicitArgsWithoutLWS = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize, pClDevice->getRootDeviceEnvironment()); dispatchInfoWithImplicitArgs.setLWS({683, 1, 1}); auto lws = dispatchInfoWithImplicitArgs.getLocalWorkgroupSize(); kernelWithImplicitArgs.setLocalWorkSizeValues(static_cast(lws.x), static_cast(lws.y), static_cast(lws.z)); - auto iohSizeWithImplicitArgsWithLWS = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize); + auto iohSizeWithImplicitArgsWithLWS = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize, pClDevice->getRootDeviceEnvironment()); EXPECT_LE(iohSizeWithImplicitArgsWithoutLWS, iohSizeWithImplicitArgsWithLWS); } diff --git a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp index 25080ec5e6..781339072a 100644 --- a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -500,7 +500,7 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenHelloWorldKernelWhenEnqueingKernelThenH auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*KernelFixture::pKernel); size_t localWorkSizes[] = {64, 1, 1}; - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes, pClDevice->getRootDeviceEnvironment()); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*KernelFixture::pKernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -540,7 +540,7 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenKernelWithSimpleArgWhenEnqueingKernelTh auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*KernelFixture::pKernel); size_t localWorkSizes[] = {64, 1, 1}; - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes, pClDevice->getRootDeviceEnvironment()); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*KernelFixture::pKernel); EXPECT_EQ(0u, expectedSizeIOH % GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); diff --git a/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp index 60f4ee90b6..34506a12c2 100644 --- a/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -98,7 +98,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingImageThenHeapsAndCommandBufferCons auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); size_t localWorkSizes[] = {256, 1, 1}; - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment()); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -146,7 +146,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingReadWriteImageThenHeapsAndCommandB auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel.get(), {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel.get()); size_t localWorkSizes[] = {256, 1, 1}; - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel.get(), localWorkSizes); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel.get(), localWorkSizes, pDevice->getRootDeviceEnvironment()); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel.get()); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -204,7 +204,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageNonBlockingThenHeapsAndComman auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); size_t localWorkSizes[] = {256, 1, 1}; - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment()); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -260,7 +260,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageBlockingThenHeapsAndCommandBu auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); size_t localWorkSizes[] = {256, 1, 1}; - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment()); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -316,7 +316,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageNonBlockingThenHeapsAndComman auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); size_t localWorkSizes[] = {256, 1, 1}; - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment()); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -372,7 +372,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageBlockingThenHeapsAndCommandBu auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); size_t localWorkSizes[] = {256, 1, 1}; - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment()); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp index 4178ac79b4..8960900a35 100644 --- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -171,7 +171,9 @@ HWTEST_F(HardwareCommandsTest, WhenCrossThreadDataIsCreatedThenOnlyRequiredSpace *kernel, false, nullptr, - sizeCrossThreadData, 0); + sizeCrossThreadData, + 0, + pClDevice->getRootDeviceEnvironment()); auto usedAfter = indirectHeap.getUsed(); EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore); @@ -199,7 +201,8 @@ HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComme false, nullptr, sizeCrossThreadData, - 0); + 0, + pClDevice->getRootDeviceEnvironment()); ASSERT_EQ(1u, kernel->getPatchInfoDataList().size()); EXPECT_EQ(0xaaaaaaaa, kernel->getPatchInfoDataList()[0].sourceAllocation); @@ -222,7 +225,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapNotAllocatedF false, nullptr, sizeCrossThreadData, - 0); + 0, + pClDevice->getRootDeviceEnvironment()); EXPECT_EQ(0u, offset); pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation); } @@ -240,7 +244,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapAllocatedFrom false, nullptr, sizeCrossThreadData, - 0); + 0, + pClDevice->getRootDeviceEnvironment()); EXPECT_EQ(expectedOffset, offset); pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation); @@ -275,7 +280,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenSendCrossThreadDataWhenWh false, nullptr, sizeCrossThreadData, - 0); + 0, + pClDevice->getRootDeviceEnvironment()); ASSERT_NE(0u, offsetCrossThreadData); EXPECT_EQ(128u, offsetCrossThreadData); @@ -373,7 +379,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes auto usedAfterIOH = ioh.getUsed(); auto usedAfterSSH = ssh.getUsed(); auto sizeRequiredDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); - auto sizeRequiredIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); + auto sizeRequiredIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment()); auto sizeRequiredSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH); @@ -559,8 +565,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF); size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ; auto numChannels = modifiedKernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; - const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); - size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize, !kernelUsesLocalIds, gfxCoreHelper); + const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); + size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize, !kernelUsesLocalIds, rootDeviceEnvironment); ASSERT_LE(expectedIohSize, ioh.getUsed()); auto expectedLocalIds = alignedMalloc(expectedIohSize, 64); @@ -569,7 +575,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe std::array{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0], modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1], modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}}, - false, grfSize, gfxCoreHelper); + false, grfSize, rootDeviceEnvironment); EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize)); alignedFree(expectedLocalIds); @@ -1153,8 +1159,8 @@ struct HardwareCommandsImplicitArgsTests : Test { kernel.setGlobalWorkSizeValues(static_cast(expectedImplicitArgs.globalSizeX), static_cast(expectedImplicitArgs.globalSizeY), static_cast(expectedImplicitArgs.globalSizeZ)); kernel.setGlobalWorkOffsetValues(static_cast(expectedImplicitArgs.globalOffsetX), static_cast(expectedImplicitArgs.globalOffsetY), static_cast(expectedImplicitArgs.globalOffsetZ)); kernel.setNumWorkGroupsValues(expectedImplicitArgs.groupCountX, expectedImplicitArgs.groupCountY, expectedImplicitArgs.groupCountZ); - const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); - implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor(), false, gfxCoreHelper); + const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); + implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor(), false, rootDeviceEnvironment); auto sizeCrossThreadData = kernel.getCrossThreadDataSize(); using DefaultWalkerType = typename FamilyType::DefaultWalkerType; @@ -1164,7 +1170,8 @@ struct HardwareCommandsImplicitArgsTests : Test { false, nullptr, sizeCrossThreadData, - 0); + 0, + pClDevice->getRootDeviceEnvironment()); EXPECT_LE(implicitArgsProgrammingSize, indirectHeap.getUsed()); @@ -1218,11 +1225,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth); auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize); - const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); - generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper); + const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); + generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment); auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize(); - size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, gfxCoreHelper); + size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, rootDeviceEnvironment); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); @@ -1252,11 +1259,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth); auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize); - const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); - generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper); + const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); + generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment); auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize(); - size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, gfxCoreHelper); + size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, rootDeviceEnvironment); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); @@ -1308,7 +1315,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeap false, nullptr, sizeCrossThreadData, - 0); + 0, + pClDevice->getRootDeviceEnvironment()); EXPECT_EQ(expectedOffset, offset); pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation); } @@ -1326,7 +1334,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeap false, nullptr, sizeCrossThreadData, - 0); + 0, + pClDevice->getRootDeviceEnvironment()); EXPECT_EQ(expectedOffset, offset); pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation); @@ -1362,7 +1371,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenSendCrossThr false, nullptr, sizeCrossThreadData, - 0); + 0, + pClDevice->getRootDeviceEnvironment()); auto expectedOffsetRelativeToIohBase = 128u; auto iohBaseAddress = is64bit ? 0u : indirectHeap.getHeapGpuBase(); diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 4dc4e31b2e..fec100bd64 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -181,7 +181,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; bool isHwLocalIdGeneration = false; - uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper); + uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment); uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; uint64_t offsetThreadData = 0u; { @@ -203,7 +203,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis auto implicitArgsCrossThreadPtr = ptrOffset(const_cast(reinterpret_cast(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); *implicitArgsCrossThreadPtr = implicitArgsGpuVA; - ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper); + ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, rootDeviceEnvironment); } memcpy_s(ptr, sizeCrossThreadData, diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 47d64485df..cfd7c657f6 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -240,7 +240,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; - uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, gfxCoreHelper); + uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, rootDeviceEnvironment); uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; { auto heap = container.getIndirectHeap(HeapType::indirectObject); @@ -254,11 +254,11 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } UNRECOVERABLE_IF(!ptr); offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast(heap->getUsed() - sizeThreadData); - + auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment(); if (pImplicitArgs) { offsetThreadData -= ImplicitArgs::getSize(); pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize; - ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), gfxCoreHelper); + ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), rootDeviceEnvironment); } if (sizeCrossThreadData > 0) { @@ -313,7 +313,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) { - void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false)); + void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false)); args.additionalCommands->push_back(commandBuffer); EncodeSemaphore::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands); diff --git a/shared/source/helpers/aarch64/local_id_gen.cpp b/shared/source/helpers/aarch64/local_id_gen.cpp index 4169785bf7..9231cf2121 100644 --- a/shared/source/helpers/aarch64/local_id_gen.cpp +++ b/shared/source/helpers/aarch64/local_id_gen.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,6 +7,7 @@ #include "shared/source/helpers/local_id_gen.h" +#include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_id_gen_special.inl" @@ -42,9 +43,10 @@ LocalIDHelper::LocalIDHelper() { LocalIDHelper LocalIDHelper::initializer; -void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const GfxCoreHelper &gfxCoreHelper) { +void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment) { bool localIdsGeneratedByHw = false; - auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw)); + auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); + auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw, rootDeviceEnvironment)); bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd); if (useLayoutForImages) { generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd); diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index e85fb39da6..858f66e49e 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -125,7 +125,7 @@ class GfxCoreHelper { virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const = 0; - virtual uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize) const = 0; + virtual uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0; virtual size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const = 0; virtual bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const = 0; @@ -174,7 +174,7 @@ class GfxCoreHelper { virtual bool isChipsetUniqueUUIDSupported() const = 0; virtual bool isTimestampShiftRequired() const = 0; virtual bool isRelaxedOrderingSupported() const = 0; - virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const = 0; + virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; virtual uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const = 0; virtual char const *getDefaultDeviceHierarchy() const = 0; static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper); @@ -341,7 +341,7 @@ class GfxCoreHelperHw : public GfxCoreHelper { uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const override; - uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize) const override; + uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override; size_t getMaxFillPaternSizeForCopyEngine() const override; size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const override; @@ -401,7 +401,7 @@ class GfxCoreHelperHw : public GfxCoreHelper { bool isChipsetUniqueUUIDSupported() const override; bool isTimestampShiftRequired() const override; bool isRelaxedOrderingSupported() const override; - uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const override; + uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override; uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const override; char const *getDefaultDeviceHierarchy() const override; diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index c19b814882..523224c37b 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -707,7 +707,7 @@ uint32_t GfxCoreHelperHw::overrideMaxWorkGroupSize(uint32_t maxWG) co } template -uint32_t GfxCoreHelperHw::adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize) const { +uint32_t GfxCoreHelperHw::adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const { return defaultMaxGroupSize; } @@ -717,7 +717,7 @@ uint32_t GfxCoreHelperHw::getMinimalGrfSize() const { } template -uint32_t GfxCoreHelperHw::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const { +uint32_t GfxCoreHelperHw::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const { return getThreadsPerWG(simd, totalWorkItems); } diff --git a/shared/source/helpers/local_id_gen.h b/shared/source/helpers/local_id_gen.h index b3748a1402..725cb8d6f2 100644 --- a/shared/source/helpers/local_id_gen.h +++ b/shared/source/helpers/local_id_gen.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -14,7 +14,7 @@ #include namespace NEO { -class GfxCoreHelper; +struct RootDeviceEnvironment; inline uint32_t getNumGrfsPerLocalIdCoordinate(uint32_t simd, uint32_t grfSize) { return (simd == 32 && grfSize == 32) ? 2 : 1; } @@ -64,7 +64,7 @@ void generateLocalIDsSimd(void *b, const std::array &localWorkgroup const std::array &dimensionsOrder, bool chooseMaxRowSize); void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, - const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const NEO::GfxCoreHelper &gfxCoreHelper); + const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment); void generateLocalIDsWithLayoutForImages(void *b, const std::array &localWorkgroupSize, uint16_t simd); bool isCompatibleWithLayoutForImages(const std::array &localWorkgroupSize, const std::array &dimensionsOrder, uint16_t simd); diff --git a/shared/source/helpers/per_thread_data.h b/shared/source/helpers/per_thread_data.h index 174a4fec86..2fdcb9f2cb 100644 --- a/shared/source/helpers/per_thread_data.h +++ b/shared/source/helpers/per_thread_data.h @@ -1,11 +1,12 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once +#include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_id_gen.h" #include "shared/source/helpers/simd_helper.h" @@ -23,12 +24,13 @@ struct PerThreadDataHelper { uint32_t numChannels, size_t localWorkSize, bool isHwLocalIdGeneration, - const GfxCoreHelper &gfxCoreHelper) { + const RootDeviceEnvironment &rootDeviceEnvironment) { auto perThreadSizeLocalIDs = static_cast(getPerThreadSizeLocalIDs(simd, grfSize, numChannels)); if (isSimd1(simd)) { return perThreadSizeLocalIDs * localWorkSize; } - return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkSize), grfSize, isHwLocalIdGeneration); + auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); + return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkSize), grfSize, isHwLocalIdGeneration, rootDeviceEnvironment); } }; // namespace PerThreadDataHelper } // namespace NEO diff --git a/shared/source/helpers/x86_64/local_id_gen.cpp b/shared/source/helpers/x86_64/local_id_gen.cpp index 3c7a9def07..9c6e3a5216 100644 --- a/shared/source/helpers/x86_64/local_id_gen.cpp +++ b/shared/source/helpers/x86_64/local_id_gen.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,6 +7,7 @@ #include "shared/source/helpers/local_id_gen.h" +#include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_id_gen_special.inl" @@ -45,9 +46,10 @@ LocalIDHelper::LocalIDHelper() { LocalIDHelper LocalIDHelper::initializer; // traditional function to generate local IDs -void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const GfxCoreHelper &gfxCoreHelper) { +void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment) { bool localIdsGeneratedByHw = false; - auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw)); + auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); + auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw, rootDeviceEnvironment)); bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd); if (useLayoutForImages) { generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd); diff --git a/shared/source/kernel/implicit_args_helper.cpp b/shared/source/kernel/implicit_args_helper.cpp index 2bba449f2e..facceb67a1 100644 --- a/shared/source/kernel/implicit_args_helper.cpp +++ b/shared/source/kernel/implicit_args_helper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,6 +7,7 @@ #include "shared/source/kernel/implicit_args_helper.h" +#include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/basic_math.h" #include "shared/source/helpers/hw_walk_order.h" @@ -43,7 +44,7 @@ uint32_t getGrfSize(uint32_t simd) { return 32u; } -uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool isHwLocalIdGeneration, const GfxCoreHelper &gfxCoreHelper) { +uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) { if (!pImplicitArgs) { return 0; } @@ -58,15 +59,15 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize); uint32_t localIdsSizeNeeded = alignUp(static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( - simdSize, grfSize, 3u, itemsInGroup, isHwLocalIdGeneration, gfxCoreHelper)), + simdSize, grfSize, 3u, itemsInGroup, isHwLocalIdGeneration, rootDeviceEnvironment)), MemoryConstants::cacheLineSize); return implicitArgsSize + localIdsSizeNeeded; } } -void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper) { +void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams, const RootDeviceEnvironment &rootDeviceEnvironment) { auto localIdsGeneratedByHw = hwGenerationOfLocalIdsParams.has_value() ? hwGenerationOfLocalIdsParams.value().first : false; - auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, localIdsGeneratedByHw, gfxCoreHelper); + auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, localIdsGeneratedByHw, rootDeviceEnvironment); auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram); auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); @@ -82,7 +83,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons static_cast(implicitArgs.localSizeY), static_cast(implicitArgs.localSizeZ)}}, dimensionOrder, - false, grfSize, gfxCoreHelper); + false, grfSize, rootDeviceEnvironment); auto sizeForLocalIdsProgramming = totalSizeToProgram - ImplicitArgs::getSize(); ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming); } diff --git a/shared/source/kernel/implicit_args_helper.h b/shared/source/kernel/implicit_args_helper.h index 71b07ad8db..f31de5faa4 100644 --- a/shared/source/kernel/implicit_args_helper.h +++ b/shared/source/kernel/implicit_args_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -17,14 +17,14 @@ namespace NEO { struct KernelDescriptor; -class GfxCoreHelper; +struct RootDeviceEnvironment; inline constexpr const char *implicitArgsRelocationSymbolName = "__INTEL_PATCH_CROSS_THREAD_OFFSET_OFF_R0"; namespace ImplicitArgsHelper { std::array getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional> hwGenerationOfLocalIdsParams); uint32_t getGrfSize(uint32_t simd); -uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool localIdsGeneratedByRuntime, const GfxCoreHelper &gfxCoreHelper); -void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper); +uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool localIdsGeneratedByRuntime, const RootDeviceEnvironment &rootDeviceEnvironment); +void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams, const RootDeviceEnvironment &rootDeviceEnvironment); } // namespace ImplicitArgsHelper } // namespace NEO diff --git a/shared/source/kernel/local_ids_cache.cpp b/shared/source/kernel/local_ids_cache.cpp index b82e324b44..07f21539ec 100644 --- a/shared/source/kernel/local_ids_cache.cpp +++ b/shared/source/kernel/local_ids_cache.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,6 +7,7 @@ #include "shared/source/kernel/local_ids_cache.h" +#include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/basic_math.h" #include "shared/source/helpers/gfx_core_helper.h" @@ -34,12 +35,13 @@ std::unique_lock LocalIdsCache::lock() { return std::unique_lock(setLocalIdsMutex); } -size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3 &group, const GfxCoreHelper &gfxCoreHelper) const { +size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3 &group, const RootDeviceEnvironment &rootDeviceEnvironment) const { const auto numElementsInGroup = static_cast(Math::computeTotalElementsCount({group[0], group[1], group[2]})); if (isSimd1(simdSize)) { return static_cast(numElementsInGroup * localIdsSizePerThread); } - const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfSize, false); + auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); + const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfSize, false, rootDeviceEnvironment); return static_cast(numberOfThreads * localIdsSizePerThread); } @@ -52,7 +54,7 @@ void LocalIdsCache::setLocalIdsForEntry(LocalIdsCacheEntry &entry, void *destina std::memcpy(destination, entry.localIdsData, entry.localIdsSize); } -void LocalIdsCache::setLocalIdsForGroup(const Vec3 &group, void *destination, const GfxCoreHelper &gfxCoreHelper) { +void LocalIdsCache::setLocalIdsForGroup(const Vec3 &group, void *destination, const RootDeviceEnvironment &rootDeviceEnvironment) { auto setLocalIdsLock = lock(); LocalIdsCacheEntry *leastAccessedEntry = &cache[0]; for (auto &cacheEntry : cache) { @@ -65,12 +67,12 @@ void LocalIdsCache::setLocalIdsForGroup(const Vec3 &group, void *desti } } - commitNewEntry(*leastAccessedEntry, group, gfxCoreHelper); + commitNewEntry(*leastAccessedEntry, group, rootDeviceEnvironment); setLocalIdsForEntry(*leastAccessedEntry, destination); } -void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3 &group, const GfxCoreHelper &gfxCoreHelper) { - entry.localIdsSize = getLocalIdsSizeForGroup(group, gfxCoreHelper); +void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3 &group, const RootDeviceEnvironment &rootDeviceEnvironment) { + entry.localIdsSize = getLocalIdsSizeForGroup(group, rootDeviceEnvironment); entry.groupSize = group; entry.accessCounter = 0U; if (entry.localIdsSize > entry.localIdsSizeAllocated) { @@ -79,7 +81,7 @@ void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3(simdSize), - {group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, gfxCoreHelper); + {group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, rootDeviceEnvironment); } } // namespace NEO \ No newline at end of file diff --git a/shared/source/kernel/local_ids_cache.h b/shared/source/kernel/local_ids_cache.h index b4c42fbcae..cbe7bdc3ce 100644 --- a/shared/source/kernel/local_ids_cache.h +++ b/shared/source/kernel/local_ids_cache.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -12,7 +12,7 @@ #include namespace NEO { -class GfxCoreHelper; +struct RootDeviceEnvironment; class LocalIdsCache { public: struct LocalIdsCacheEntry { @@ -30,13 +30,13 @@ class LocalIdsCache { LocalIdsCache(size_t cacheSize, std::array wgDimOrder, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages = false); ~LocalIdsCache(); - void setLocalIdsForGroup(const Vec3 &group, void *destination, const GfxCoreHelper &gfxCoreHelper); - size_t getLocalIdsSizeForGroup(const Vec3 &group, const GfxCoreHelper &gfxCoreHelper) const; + void setLocalIdsForGroup(const Vec3 &group, void *destination, const RootDeviceEnvironment &rootDeviceEnvironment); + size_t getLocalIdsSizeForGroup(const Vec3 &group, const RootDeviceEnvironment &rootDeviceEnvironment) const; size_t getLocalIdsSizePerThread() const; protected: void setLocalIdsForEntry(LocalIdsCacheEntry &entry, void *destination); - void commitNewEntry(LocalIdsCacheEntry &entry, const Vec3 &group, const GfxCoreHelper &gfxCoreHelper); + void commitNewEntry(LocalIdsCacheEntry &entry, const Vec3 &group, const RootDeviceEnvironment &rootDeviceEnvironment); std::unique_lock lock(); StackVec cache; diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index bb24a0de5a..d6bebd40fc 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -1589,25 +1589,27 @@ HWTEST_F(GfxCoreHelperTest, GivenCooperativeEngineSupportedAndNotUsedWhenAdjustM HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeThenAlwaysReturnDeviceDefault) { const auto &gfxCoreHelper = getHelper(); + const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); constexpr auto defaultMaxGroupSize = 1024u; uint32_t simdSize = 16u; uint32_t isHwLocalIdGeneration = true; uint32_t numGrfRequired = GrfConfig::largeGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment)); simdSize = 32u; numGrfRequired = GrfConfig::largeGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment)); simdSize = 16u; isHwLocalIdGeneration = false; numGrfRequired = GrfConfig::defaultGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment)); } HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) { auto &gfxCoreHelper = getHelper(); + const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); std::array, 8> values = {{ {32u, 32u, 1u}, // SIMT Size, totalWorkItems, Max Num of threads {32u, 64u, 2u}, @@ -1620,7 +1622,7 @@ HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThe }}; for (auto &[simtSize, totalWgSize, expectedNumThreadsPerThreadGroup] : values) { - EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, true)); + EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, true, rootDeviceEnvironment)); } } @@ -1634,6 +1636,7 @@ HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuT DebugManagerStateRestore dbgRestore; debugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.set(1); const auto &gfxCoreHelper = getHelper(); + const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); std::array, 8> values = {{ {32u, 32u, 128u, 1, 1u}, // SIMT Size, totalWorkItems, Max Num of threads, Grf size, Hw local id generation @@ -1647,7 +1650,7 @@ HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuT }}; for (auto &[simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { - EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration)); + EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, rootDeviceEnvironment)); } } diff --git a/shared/test/unit_test/helpers/local_id_tests.cpp b/shared/test/unit_test/helpers/local_id_tests.cpp index 0c78eef29f..ccd158da9d 100644 --- a/shared/test/unit_test/helpers/local_id_tests.cpp +++ b/shared/test/unit_test/helpers/local_id_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -12,6 +12,7 @@ #include "shared/source/helpers/ptr_math.h" #include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/helpers/unit_test_helper.h" +#include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/test_macros/hw_test.h" #include @@ -112,8 +113,9 @@ TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize std::array localSizes = {{2u, 2u, 1u}}; std::array dimensionsOrder = {{0u, 1u, 2u}}; - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); - generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, *gfxCoreHelper.get()); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, rootDeviceEnvironment); EXPECT_EQ(localIdsView[0], 0u); EXPECT_EQ(localIdsView[1], 1u); EXPECT_EQ(localIdsView[2], 0u); @@ -308,42 +310,47 @@ struct LocalIDFixture : ::testing::TestWithParamplatform.eRenderCoreFamily); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - std::array{{0, 1, 2}}, false, grfSize, *gfxCoreHelper.get()); + std::array{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment); validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); } HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenAllWorkItemsCovered) { - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - std::array{{0, 1, 2}}, false, grfSize, *gfxCoreHelper.get()); + std::array{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment); validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); } HWTEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) { auto dimensionsOrder = std::array{{0, 1, 2}}; - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - dimensionsOrder, false, grfSize, *gfxCoreHelper.get()); + dimensionsOrder, false, grfSize, rootDeviceEnvironment); validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder); } HWTEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) { auto dimensionsOrder = std::array{{1, 0, 2}}; - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - dimensionsOrder, false, grfSize, *gfxCoreHelper.get()); + dimensionsOrder, false, grfSize, rootDeviceEnvironment); validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder); } HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) { auto dimensionsOrder = std::array{{2, 1, 0}}; - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - dimensionsOrder, false, grfSize, *gfxCoreHelper.get()); + dimensionsOrder, false, grfSize, rootDeviceEnvironment); validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder); } @@ -383,8 +390,9 @@ struct LocalIdsLayoutForImagesTest : ::testing::TestWithParam(memory.get()); EXPECT_TRUE(isCompatibleWithLayoutForImages(localWorkSize, dimensionsOrder, simd)); - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); - generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, *gfxCoreHelper.get()); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment); } void validateGRF() { uint32_t totalLocalIds = localWorkSize.at(0) * localWorkSize.at(1); @@ -484,9 +492,10 @@ TEST_P(LocalIdsLayoutTest, givenLocalWorkgroupSize4x4x1WhenGenerateLocalIdsThenH auto alignedMemory2 = allocateAlignedMemory(size, 32); auto buffer2 = reinterpret_cast(alignedMemory2.get()); memset(buffer2, 0xff, size); - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); - generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, *gfxCoreHelper.get()); - generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, *gfxCoreHelper.get()); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, rootDeviceEnvironment); + generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment); for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) { for (auto j = 0u; j < rowWidth; j++) { diff --git a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp index ecf3389085..f1b592250f 100644 --- a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp +++ b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,6 +13,7 @@ #include "shared/source/kernel/implicit_args_helper.h" #include "shared/source/kernel/kernel_descriptor.h" #include "shared/test/common/helpers/default_hw_info.h" +#include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/test_macros/hw_test.h" using namespace NEO; @@ -57,8 +58,9 @@ TEST(ImplicitArgsHelperTest, givenSimdGreaterThanOneWhenGettingGrfSizeThenGrfSiz TEST(ImplicitArgsHelperTest, givenNoImplicitArgsWhenGettingSizeForImplicitArgsProgrammingThenZeroIsReturned) { KernelDescriptor kernelDescriptor{}; - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); - EXPECT_EQ(0u, ImplicitArgsHelper::getSizeForImplicitArgsPatching(nullptr, kernelDescriptor, false, *gfxCoreHelper.get())); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + EXPECT_EQ(0u, ImplicitArgsHelper::getSizeForImplicitArgsPatching(nullptr, kernelDescriptor, false, rootDeviceEnvironment)); } TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) { @@ -75,9 +77,10 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ; - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); - auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize, false, *gfxCoreHelper.get()), MemoryConstants::cacheLineSize); - EXPECT_EQ(localIdsSize + ImplicitArgs::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get())); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize); + EXPECT_EQ(localIdsSize + ImplicitArgs::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment)); } TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) { @@ -91,8 +94,9 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl implicitArgs.localSizeX = 2; implicitArgs.localSizeY = 3; implicitArgs.localSizeZ = 4; - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); - EXPECT_EQ(alignUp(implicitArgs.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get())); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + EXPECT_EQ(alignUp(implicitArgs.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment)); } TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) { @@ -109,8 +113,9 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP implicitArgs.localSizeX = 2; implicitArgs.localSizeY = 3; implicitArgs.localSizeZ = 4; - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); - auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get()); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment); auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ; auto localIdsPatchingSize = totalWorkgroupSize * 3 * sizeof(uint16_t); @@ -121,7 +126,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP memset(memoryToPatch.get(), pattern, totalSizeForPatching); - auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get()); + auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, rootDeviceEnvironment); EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching)); @@ -151,8 +156,9 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl implicitArgs.localSizeX = 2; implicitArgs.localSizeY = 3; implicitArgs.localSizeZ = 4; - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); - auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get()); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment); EXPECT_EQ(alignUp(ImplicitArgs::getSize(), MemoryConstants::cacheLineSize), totalSizeForPatching); @@ -162,7 +168,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl memset(memoryToPatch.get(), pattern, totalSizeForPatching); - auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get()); + auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, rootDeviceEnvironment); EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching)); diff --git a/shared/test/unit_test/kernel/local_ids_cache_tests.cpp b/shared/test/unit_test/kernel/local_ids_cache_tests.cpp index 2f4f96a0b3..fa5bbadd33 100644 --- a/shared/test/unit_test/kernel/local_ids_cache_tests.cpp +++ b/shared/test/unit_test/kernel/local_ids_cache_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -12,6 +12,7 @@ #include "shared/source/helpers/per_thread_data.h" #include "shared/source/kernel/local_ids_cache.h" #include "shared/test/common/helpers/default_hw_info.h" +#include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/test_macros/test.h" @@ -38,8 +39,9 @@ using LocalIdsCacheTests = Test; TEST_F(LocalIdsCacheTests, GivenCacheMissWhenGetLocalIdsForGroupThenNewEntryIsCommitedIntoLeastUsedEntry) { localIdsCache->cache.resize(2); localIdsCache->cache[0].accessCounter = 2U; - auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); - localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get()); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), rootDeviceEnvironment); EXPECT_EQ(groupSize, localIdsCache->cache[1].groupSize); EXPECT_NE(nullptr, localIdsCache->cache[1].localIdsData); @@ -54,8 +56,9 @@ TEST_F(LocalIdsCacheTests, GivenEntryInCacheWhenGetLocalIdsForGroupThenEntryFrom localIdsCache->cache[0].localIdsSize = 512U; localIdsCache->cache[0].localIdsSizeAllocated = 512U; localIdsCache->cache[0].accessCounter = 1U; - auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); - localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get()); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), rootDeviceEnvironment); EXPECT_EQ(2U, localIdsCache->cache[0].accessCounter); } @@ -68,8 +71,9 @@ TEST_F(LocalIdsCacheTests, GivenEntryWithBiggerBufferAllocatedWhenGetLocalIdsFor const auto localIdsData = localIdsCache->cache[0].localIdsData; groupSize = {2, 1, 1}; - auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); - localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get()); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), rootDeviceEnvironment); EXPECT_EQ(1U, localIdsCache->cache[0].accessCounter); EXPECT_EQ(192U, localIdsCache->cache[0].localIdsSize); EXPECT_EQ(512U, localIdsCache->cache[0].localIdsSizeAllocated); @@ -82,16 +86,18 @@ TEST_F(LocalIdsCacheTests, GivenValidLocalIdsCacheWhenGettingLocalIdsSizePerThre } TEST_F(LocalIdsCacheTests, GivenValidLocalIdsCacheWhenGettingLocalIdsSizeForGroupThenCorrectValueIsReturned) { - auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); - auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, *gfxCoreHelper.get()); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, rootDeviceEnvironment); EXPECT_EQ(1536U, localIdsSizePerThread); } TEST(LocalIdsCacheTest, givenSimd1WhenGettingLocalIdsSizeForGroupThenCorrectValueIsReturned) { - auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; auto localIdsCache = std::make_unique(1u, 1u); Vec3 groupSize = {128, 2, 1}; - auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, *gfxCoreHelper.get()); + auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, rootDeviceEnvironment); auto expectedLocalIdsSizePerThread = groupSize[0] * groupSize[1] * groupSize[2] * localIdsCache->getLocalIdsSizePerThread(); EXPECT_EQ(expectedLocalIdsSizePerThread, localIdsSizePerThread); } \ No newline at end of file