diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index c882b06643..c7f369fd50 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -363,7 +363,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, static_cast(groupSizeY), static_cast(groupSizeZ)}}, std::array{{0, 1, 2}}, - false, grfSize); + false, grfSize, gfxCoreHelper); } this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index c814e6b6b3..907aadb0fb 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -1023,7 +1023,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth); auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize); - generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize); + const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper(); + generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs); size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize); @@ -1068,7 +1069,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth); auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize); - generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize); + const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper(); + generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs); size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize); diff --git a/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl b/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl index 15c5dab2d9..44199f2dd0 100644 --- a/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl +++ b/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl @@ -91,7 +91,8 @@ size_t HardwareCommandsHelper::sendCrossThreadData( auto implicitArgsGpuVA = indirectHeap.getGraphicsAllocation()->getGpuAddress() + indirectHeap.getUsed(); auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming); - ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}); + const auto &gfxCoreHelper = kernel.getGfxCoreHelper(); + ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper); auto implicitArgsCrossThreadPtr = ptrOffset(reinterpret_cast(kernel.getCrossThreadData()), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); *implicitArgsCrossThreadPtr = implicitArgsGpuVA; diff --git a/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl b/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl index c463a5a4f9..7dd4e52ca4 100644 --- a/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl +++ b/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl @@ -96,8 +96,8 @@ size_t HardwareCommandsHelper::sendCrossThreadData( kernelAttributes.flags.requiresWorkgroupWalkOrder, requiredWalkOrder, kernelDescriptor.kernelAttributes.simdSize); - - ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder)); + const auto &gfxCoreHelper = kernel.getGfxCoreHelper(); + ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), gfxCoreHelper); } using InlineData = typename GfxFamily::INLINE_DATA; diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 66961d203d..b9c6f0414a 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -581,7 +581,7 @@ cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName, cl_ulong privateMemSize; size_t maxWorkgroupSize; const auto &hwInfo = clDevice.getHardwareInfo(); - auto &gfxCoreHelper = clDevice.getGfxCoreHelper(); + auto &gfxCoreHelper = this->getGfxCoreHelper(); auto &clGfxCoreHelper = clDevice.getRootDeviceEnvironment().getHelper(); GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet); @@ -1537,7 +1537,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex, disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(), kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext()); } else if (isValidOffset(argAsPtr.bindless)) { - auto &gfxCoreHelper = getDevice().getGfxCoreHelper(); + auto &gfxCoreHelper = this->getGfxCoreHelper(); auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); auto surfaceState = ptrOffset(getSurfaceStateHeap(), surfaceStateSize * argIndex); @@ -1655,7 +1655,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex, void *surfaceState = nullptr; if (isValidOffset(argAsImg.bindless)) { - auto &gfxCoreHelper = getDevice().getGfxCoreHelper(); + auto &gfxCoreHelper = this->getGfxCoreHelper(); auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); surfaceState = ptrOffset(getSurfaceStateHeap(), surfaceStateSize * argIndex); } else { @@ -2130,7 +2130,7 @@ uint64_t Kernel::getKernelStartAddress(const bool localIdsGenerationByRuntime, c kernelStartOffset += getStartOffset(); auto &hardwareInfo = getHardwareInfo(); - const auto &gfxCoreHelper = getDevice().getGfxCoreHelper(); + const auto &gfxCoreHelper = this->getGfxCoreHelper(); const auto &productHelper = getDevice().getProductHelper(); if (isCssUsed && gfxCoreHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo, productHelper)) { @@ -2140,7 +2140,7 @@ uint64_t Kernel::getKernelStartAddress(const bool localIdsGenerationByRuntime, c return kernelStartOffset; } void *Kernel::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) { - auto &gfxCoreHelper = getDevice().getGfxCoreHelper(); + auto &gfxCoreHelper = this->getGfxCoreHelper(); auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); NEO::BindlessHeapsHelper *bindlessHeapsHelper = getDevice().getDevice().getBindlessHeapsHelper(); auto ssInHeap = bindlessHeapsHelper->allocateSSInHeap(surfaceStateSize, alloc, NEO::BindlessHeapsHelper::GLOBAL_SSH); @@ -2159,7 +2159,7 @@ uint32_t Kernel::getAdditionalKernelExecInfo() const { } bool Kernel::requiresWaDisableRccRhwoOptimization() const { - auto &gfxCoreHelper = getDevice().getGfxCoreHelper(); + auto &gfxCoreHelper = this->getGfxCoreHelper(); auto rootDeviceIndex = getDevice().getRootDeviceIndex(); if (gfxCoreHelper.isWaDisableRccRhwoOptimizationRequired() && isUsingSharedObjArgs()) { @@ -2270,7 +2270,7 @@ void Kernel::reconfigureKernel() { kernelDescriptor.kernelAttributes.simdSize != 32) { this->maxKernelWorkGroupSize >>= 1; } - const auto &gfxCoreHelper = getDevice().getGfxCoreHelper(); + const auto &gfxCoreHelper = this->getGfxCoreHelper(); bool isLocalIdsGeneratedByHw = false; // if local ids generated by runtime then more work groups available auto maxWorkGroupSize = static_cast(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize())); @@ -2369,7 +2369,8 @@ void Kernel::initializeLocalIdsCache() { void Kernel::setLocalIdsForGroup(const Vec3 &groupSize, void *destination) const { UNRECOVERABLE_IF(localIdsCache.get() == nullptr); - localIdsCache->setLocalIdsForGroup(groupSize, destination); + const auto &gfxCoreHelper = this->getGfxCoreHelper(); + localIdsCache->setLocalIdsForGroup(groupSize, destination, gfxCoreHelper); } size_t Kernel::getLocalIdsSizeForGroup(const Vec3 &groupSize) const { diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h index 94f100c8fc..b86ba4d0c5 100644 --- a/opencl/source/kernel/kernel.h +++ b/opencl/source/kernel/kernel.h @@ -410,6 +410,10 @@ class Kernel : public ReferenceTrackedObject { size_t getLocalIdsSizeForGroup(const Vec3 &groupSize) const; size_t getLocalIdsSizePerThread() const; + const GfxCoreHelper &getGfxCoreHelper() const { + return getDevice().getGfxCoreHelper(); + } + protected: struct KernelConfig { Vec3 gws; diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp index 7afbccf43f..76dbfdd63c 100644 --- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp @@ -552,12 +552,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe ASSERT_LE(expectedIohSize, ioh.getUsed()); auto expectedLocalIds = alignedMalloc(expectedIohSize, 64); + const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(), std::array{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}}, std::array{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0], modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1], modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}}, - false, grfSize); + false, grfSize, gfxCoreHelper); EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize)); alignedFree(expectedLocalIds); @@ -1377,7 +1378,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth); auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize); - generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize); + const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); + generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs); size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize); @@ -1410,7 +1412,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth); auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize); - generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize); + const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); + generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs); size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize); diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 0722f99a9b..19c7e9eef6 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -183,7 +183,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis auto implicitArgsCrossThreadPtr = ptrOffset(const_cast(reinterpret_cast(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); *implicitArgsCrossThreadPtr = implicitArgsGpuVA; - ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}); + ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper); } memcpy_s(ptr, sizeCrossThreadData, diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index d6e711509e..a14be7c67f 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -218,7 +218,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis if (pImplicitArgs) { offsetThreadData -= sizeof(ImplicitArgs); pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize; - ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder)); + ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), gfxCoreHelper); } if (sizeCrossThreadData > 0) { diff --git a/shared/source/helpers/aarch64/local_id_gen.cpp b/shared/source/helpers/aarch64/local_id_gen.cpp index 8a3cd04ea7..4169785bf7 100644 --- a/shared/source/helpers/aarch64/local_id_gen.cpp +++ b/shared/source/helpers/aarch64/local_id_gen.cpp @@ -8,6 +8,7 @@ #include "shared/source/helpers/local_id_gen.h" #include "shared/source/helpers/aligned_memory.h" +#include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_id_gen_special.inl" #include "shared/source/utilities/cpu_info.h" @@ -41,8 +42,9 @@ LocalIDHelper::LocalIDHelper() { LocalIDHelper LocalIDHelper::initializer; -void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) { - auto threadsPerWorkGroup = static_cast(getThreadsPerWG(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]))); +void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const GfxCoreHelper &gfxCoreHelper) { + bool localIdsGeneratedByHw = false; + auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw)); bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd); if (useLayoutForImages) { generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd); diff --git a/shared/source/helpers/local_id_gen.h b/shared/source/helpers/local_id_gen.h index 94fb33dd5e..ab88965571 100644 --- a/shared/source/helpers/local_id_gen.h +++ b/shared/source/helpers/local_id_gen.h @@ -12,6 +12,7 @@ #include namespace NEO { +class GfxCoreHelper; inline uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) { return (simd == 32 && grfSize == 32) ? 2 : 1; } @@ -58,7 +59,7 @@ void generateLocalIDsSimd(void *b, const std::array &localWorkgroup const std::array &dimensionsOrder, bool chooseMaxRowSize); void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, - const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize); + const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const NEO::GfxCoreHelper &gfxCoreHelper); void generateLocalIDsWithLayoutForImages(void *b, const std::array &localWorkgroupSize, uint16_t simd); bool isCompatibleWithLayoutForImages(const std::array &localWorkgroupSize, const std::array &dimensionsOrder, uint16_t simd); diff --git a/shared/source/helpers/x86_64/local_id_gen.cpp b/shared/source/helpers/x86_64/local_id_gen.cpp index 18c464284f..3c7a9def07 100644 --- a/shared/source/helpers/x86_64/local_id_gen.cpp +++ b/shared/source/helpers/x86_64/local_id_gen.cpp @@ -8,6 +8,7 @@ #include "shared/source/helpers/local_id_gen.h" #include "shared/source/helpers/aligned_memory.h" +#include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_id_gen_special.inl" #include "shared/source/utilities/cpu_info.h" @@ -44,8 +45,9 @@ LocalIDHelper::LocalIDHelper() { LocalIDHelper LocalIDHelper::initializer; // traditional function to generate local IDs -void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) { - auto threadsPerWorkGroup = static_cast(getThreadsPerWG(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]))); +void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const GfxCoreHelper &gfxCoreHelper) { + bool localIdsGeneratedByHw = false; + auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw)); bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd); if (useLayoutForImages) { generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd); diff --git a/shared/source/kernel/implicit_args.h b/shared/source/kernel/implicit_args.h index 1eb6090cc7..9046809586 100644 --- a/shared/source/kernel/implicit_args.h +++ b/shared/source/kernel/implicit_args.h @@ -15,6 +15,7 @@ namespace NEO { struct KernelDescriptor; +class GfxCoreHelper; struct ImplicitArgs { uint8_t structSize; @@ -51,6 +52,6 @@ namespace ImplicitArgsHelper { std::array getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional> hwGenerationOfLocalIdsParams); uint32_t getGrfSize(uint32_t simd); uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor); -void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams); +void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper); } // namespace ImplicitArgsHelper } // namespace NEO diff --git a/shared/source/kernel/implicit_args_helper.cpp b/shared/source/kernel/implicit_args_helper.cpp index 24f783a705..7b0b7ecdc8 100644 --- a/shared/source/kernel/implicit_args_helper.cpp +++ b/shared/source/kernel/implicit_args_helper.cpp @@ -62,7 +62,7 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const } } -void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams) { +void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper) { auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor); auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram); @@ -80,7 +80,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons static_cast(implicitArgs.localSizeY), static_cast(implicitArgs.localSizeZ)}}, dimensionOrder, - false, grfSize); + false, grfSize, gfxCoreHelper); auto sizeForLocalIdsProgramming = totalSizeToProgram - sizeof(NEO::ImplicitArgs); ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming); } diff --git a/shared/source/kernel/local_ids_cache.cpp b/shared/source/kernel/local_ids_cache.cpp index 6fb149297a..f678b72fbb 100644 --- a/shared/source/kernel/local_ids_cache.cpp +++ b/shared/source/kernel/local_ids_cache.cpp @@ -47,7 +47,7 @@ void LocalIdsCache::setLocalIdsForEntry(LocalIdsCacheEntry &entry, void *destina std::memcpy(destination, entry.localIdsData, entry.localIdsSize); } -void LocalIdsCache::setLocalIdsForGroup(const Vec3 &group, void *destination) { +void LocalIdsCache::setLocalIdsForGroup(const Vec3 &group, void *destination, const GfxCoreHelper &gfxCoreHelper) { auto setLocalIdsLock = lock(); LocalIdsCacheEntry *leastAccessedEntry = &cache[0]; for (auto &cacheEntry : cache) { @@ -60,11 +60,11 @@ void LocalIdsCache::setLocalIdsForGroup(const Vec3 &group, void *desti } } - commitNewEntry(*leastAccessedEntry, group); + commitNewEntry(*leastAccessedEntry, group, gfxCoreHelper); setLocalIdsForEntry(*leastAccessedEntry, destination); } -void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3 &group) { +void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3 &group, const GfxCoreHelper &gfxCoreHelper) { entry.localIdsSize = getLocalIdsSizeForGroup(group); entry.groupSize = group; entry.accessCounter = 0U; @@ -74,7 +74,7 @@ void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3(simdSize), - {group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize); + {group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, gfxCoreHelper); } } // namespace NEO \ No newline at end of file diff --git a/shared/source/kernel/local_ids_cache.h b/shared/source/kernel/local_ids_cache.h index f7fb37d262..26955ed6b0 100644 --- a/shared/source/kernel/local_ids_cache.h +++ b/shared/source/kernel/local_ids_cache.h @@ -12,7 +12,7 @@ #include namespace NEO { - +class GfxCoreHelper; class LocalIdsCache { public: struct LocalIdsCacheEntry { @@ -30,13 +30,13 @@ class LocalIdsCache { LocalIdsCache(size_t cacheSize, std::array wgDimOrder, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages = false); ~LocalIdsCache(); - void setLocalIdsForGroup(const Vec3 &group, void *destination); + void setLocalIdsForGroup(const Vec3 &group, void *destination, const GfxCoreHelper &gfxCoreHelper); size_t getLocalIdsSizeForGroup(const Vec3 &group) const; size_t getLocalIdsSizePerThread() const; protected: void setLocalIdsForEntry(LocalIdsCacheEntry &entry, void *destination); - void commitNewEntry(LocalIdsCacheEntry &entry, const Vec3 &group); + void commitNewEntry(LocalIdsCacheEntry &entry, const Vec3 &group, const GfxCoreHelper &gfxCoreHelper); std::unique_lock lock(); StackVec cache; diff --git a/shared/test/unit_test/helpers/local_id_tests.cpp b/shared/test/unit_test/helpers/local_id_tests.cpp index 4cbf58c52e..661277ce9b 100644 --- a/shared/test/unit_test/helpers/local_id_tests.cpp +++ b/shared/test/unit_test/helpers/local_id_tests.cpp @@ -7,8 +7,10 @@ #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/basic_math.h" +#include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_id_gen.h" #include "shared/source/helpers/ptr_math.h" +#include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/test_macros/hw_test.h" @@ -75,14 +77,16 @@ TEST(LocalID, GivenSimd1WhenGettingPerThreadSizeLocalIdsThenValueIsEqualGrfSize) EXPECT_EQ(grfSize, getPerThreadSizeLocalIDs(simd, grfSize)); } -TEST(LocalID, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize) { + +TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize) { auto localIdsPtr = allocateAlignedMemory(3 * 64u, MemoryConstants::cacheLineSize); uint16_t *localIdsView = reinterpret_cast(localIdsPtr.get()); std::array localSizes = {{2u, 2u, 1u}}; std::array dimensionsOrder = {{0u, 1u, 2u}}; - generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, *gfxCoreHelper.get()); EXPECT_EQ(localIdsView[0], 0u); EXPECT_EQ(localIdsView[1], 1u); EXPECT_EQ(localIdsView[2], 0u); @@ -277,37 +281,42 @@ struct LocalIDFixture : ::testing::TestWithParamplatform.eRenderCoreFamily); generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - std::array{{0, 1, 2}}, false, grfSize); + std::array{{0, 1, 2}}, false, grfSize, *gfxCoreHelper.get()); validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); } HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenAllWorkItemsCovered) { + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - std::array{{0, 1, 2}}, false, grfSize); + std::array{{0, 1, 2}}, false, grfSize, *gfxCoreHelper.get()); validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); } HWTEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) { auto dimensionsOrder = std::array{{0, 1, 2}}; + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - dimensionsOrder, false, grfSize); + dimensionsOrder, false, grfSize, *gfxCoreHelper.get()); validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder); } HWTEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) { auto dimensionsOrder = std::array{{1, 0, 2}}; + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - dimensionsOrder, false, grfSize); + dimensionsOrder, false, grfSize, *gfxCoreHelper.get()); validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder); } HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) { auto dimensionsOrder = std::array{{2, 1, 0}}; + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); generateLocalIDs(buffer, simd, std::array{{static_cast(localWorkSizeX), static_cast(localWorkSizeY), static_cast(localWorkSizeZ)}}, - dimensionsOrder, false, grfSize); + dimensionsOrder, false, grfSize, *gfxCoreHelper.get()); validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper::useFullRowForLocalIdsGeneration); validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder); } @@ -335,8 +344,8 @@ struct LocalIdsLayoutForImagesTest : ::testing::TestWithParam(memory.get()); EXPECT_TRUE(isCompatibleWithLayoutForImages(localWorkSize, dimensionsOrder, simd)); - generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, *gfxCoreHelper.get()); } void validateGRF() { uint32_t totalLocalIds = localWorkSize.at(0) * localWorkSize.at(1); @@ -447,9 +457,9 @@ TEST_P(LocalIdsLayoutTest, givenLocalWorkgroupSize4x4x1WhenGenerateLocalIdsThenH auto alignedMemory2 = allocateAlignedMemory(size, 32); auto buffer2 = reinterpret_cast(alignedMemory2.get()); memset(buffer2, 0xff, size); - - generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize); - generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, *gfxCoreHelper.get()); + generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, *gfxCoreHelper.get()); for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) { for (auto j = 0u; j < rowWidth; j++) { diff --git a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp index aefe5d7590..e5401b893d 100644 --- a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp +++ b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp @@ -6,11 +6,13 @@ */ #include "shared/source/helpers/aligned_memory.h" +#include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/hw_walk_order.h" #include "shared/source/helpers/per_thread_data.h" #include "shared/source/helpers/ptr_math.h" #include "shared/source/kernel/implicit_args.h" #include "shared/source/kernel/kernel_descriptor.h" +#include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/test_macros/hw_test.h" using namespace NEO; @@ -117,8 +119,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP uint8_t pattern = 0xcd; memset(memoryToPatch.get(), pattern, totalSizeForPatching); - - auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get()); EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching)); @@ -158,8 +160,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl uint8_t pattern = 0xcd; memset(memoryToPatch.get(), pattern, totalSizeForPatching); - - auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get()); EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching)); diff --git a/shared/test/unit_test/kernel/local_ids_cache_tests.cpp b/shared/test/unit_test/kernel/local_ids_cache_tests.cpp index 89f585ddb8..602832f91b 100644 --- a/shared/test/unit_test/kernel/local_ids_cache_tests.cpp +++ b/shared/test/unit_test/kernel/local_ids_cache_tests.cpp @@ -7,8 +7,11 @@ #include "shared/source/command_stream/linear_stream.h" #include "shared/source/helpers/aligned_memory.h" +#include "shared/source/helpers/gfx_core_helper.h" +#include "shared/source/helpers/hw_info.h" #include "shared/source/helpers/per_thread_data.h" #include "shared/source/kernel/local_ids_cache.h" +#include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/test_macros/test.h" @@ -35,7 +38,8 @@ using LocalIdsCacheTest = Test; TEST_F(LocalIdsCacheTest, GivenCacheMissWhenGetLocalIdsForGroupThenNewEntryIsCommitedIntoLeastUsedEntry) { localIdsCache->cache.resize(2); localIdsCache->cache[0].accessCounter = 2U; - localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data()); + auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); + localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get()); EXPECT_EQ(groupSize, localIdsCache->cache[1].groupSize); EXPECT_NE(nullptr, localIdsCache->cache[1].localIdsData); @@ -50,7 +54,8 @@ TEST_F(LocalIdsCacheTest, GivenEntryInCacheWhenGetLocalIdsForGroupThenEntryFromC localIdsCache->cache[0].localIdsSize = 512U; localIdsCache->cache[0].localIdsSizeAllocated = 512U; localIdsCache->cache[0].accessCounter = 1U; - localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data()); + auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); + localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get()); EXPECT_EQ(2U, localIdsCache->cache[0].accessCounter); } @@ -63,7 +68,8 @@ TEST_F(LocalIdsCacheTest, GivenEntryWithBiggerBufferAllocatedWhenGetLocalIdsForG const auto localIdsData = localIdsCache->cache[0].localIdsData; groupSize = {2, 1, 1}; - localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data()); + auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); + localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get()); EXPECT_EQ(1U, localIdsCache->cache[0].accessCounter); EXPECT_EQ(192U, localIdsCache->cache[0].localIdsSize); EXPECT_EQ(512U, localIdsCache->cache[0].localIdsSizeAllocated);