diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index b6cfca5360..f8ffc878c5 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -1000,7 +1000,7 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper); auto sizeCrossThreadData = kernel->getCrossThreadDataSize(); auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup(); - EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize); + EXPECT_EQ(indirectHeap->getUsed(), alignUp(sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize, gfxCoreHelper.getIOHAlignment())); if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) { expectedImplicitArgs.localIdTablePtr = indirectHeapAllocation->getGpuAddress(); diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 91695a7232..654e3670c6 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -327,6 +327,8 @@ size_t HardwareCommandsHelper::sendIndirectState( auto indirectDataLength = alignUp(static_cast(sizeCrossThreadData + sizePerThreadDataTotal), WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); walkerCmd->setIndirectDataLength(indirectDataLength); + + ioh.align(kernel.getGfxCoreHelper().getIOHAlignment()); } return offsetCrossThreadData; diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index be51cb25b8..b4f29c034b 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -343,6 +343,8 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } else { walkerCmd.setIndirectDataStartAddress(static_cast(offsetThreadData)); walkerCmd.setIndirectDataLength(sizeThreadData); + + container.getIndirectHeap(HeapType::indirectObject)->align(rootDeviceEnvironment.getHelper().getIOHAlignment()); } EncodeDispatchKernel::encodeThreadData(walkerCmd, diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 04d3188b3c..dff57a0a9a 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -574,6 +574,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ForceSemaphoreDelayBetweenWaits, -1, "Specifies DECLARE_DEBUG_VARIABLE(int32_t, ForceLocalMemoryAccessMode, -1, "-1: don't override, 0: default rules apply, 1: CPU can access local memory, 3: CPU never accesses local memory") DECLARE_DEBUG_VARIABLE(int32_t, ForceUserptrAlignment, -1, "-1: no force (4kb), >0: n kb alignment") DECLARE_DEBUG_VARIABLE(int32_t, ForceCommandBufferAlignment, -1, "-1: no force (64kb), >0: n kb alignment") +DECLARE_DEBUG_VARIABLE(int32_t, ForceIOHAlignment, -1, "-1: no force, >0: n byte alignment") DECLARE_DEBUG_VARIABLE(int32_t, ForceDefaultHeapSize, -1, "-1: no force (64kb), >0: n kb size") DECLARE_DEBUG_VARIABLE(int32_t, PreferCopyEngineForCopyBufferToBuffer, -1, "-1: default, 0: prefer EUs, 1: prefer blitter") DECLARE_DEBUG_VARIABLE(int64_t, ForceSystemMemoryPlacement, 0, "0: default, >0: (bitmask) for given Graphics Allocation Type, force system memory placement") diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index 91ef522d64..af98672ef2 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -159,6 +159,7 @@ class GfxCoreHelper { virtual size_t getMax3dImageWidthOrHeight() const = 0; virtual uint64_t getMaxMemAllocSize() const = 0; virtual uint64_t getPatIndex(CacheRegion cacheRegion, CachePolicy cachePolicy) const = 0; + virtual size_t getIOHAlignment() const = 0; virtual bool isStatelessToStatefulWithOffsetSupported() const = 0; virtual void encodeBufferSurfaceState(EncodeSurfaceStateArgs &args) const = 0; virtual bool platformSupportsImplicitScaling(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const = 0; @@ -383,6 +384,7 @@ class GfxCoreHelperHw : public GfxCoreHelper { size_t getMax3dImageWidthOrHeight() const override; uint64_t getMaxMemAllocSize() const override; uint64_t getPatIndex(CacheRegion cacheRegion, CachePolicy cachePolicy) const override; + size_t getIOHAlignment() const override; bool isStatelessToStatefulWithOffsetSupported() const override; void encodeBufferSurfaceState(EncodeSurfaceStateArgs &args) const override; bool platformSupportsImplicitScaling(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const override; diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index b8f0f316d0..6f094b8659 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -668,6 +668,11 @@ uint64_t GfxCoreHelperHw::getPatIndex(CacheRegion cacheRegion, CacheP return -1; } +template +size_t GfxCoreHelperHw::getIOHAlignment() const { + return 1; +} + template bool GfxCoreHelperHw::copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const { if (debugManager.flags.ExperimentalCopyThroughLock.get() != -1) { diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 7ba8caa628..83b26f84a1 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -279,6 +279,7 @@ OverrideSlmSize = -1 UseCyclesPerSecondTimer = 0 PrintOsContextInitializations = 0 WaitLoopCount = -1 +ForceIOHAlignment = -1 DebuggerLogBitmask = 0 GTPinAllocateBufferInSharedMemory = -1 DeferOsContextInitialization = -1 diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp index d6f4ea53b7..66d334d50b 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp @@ -592,9 +592,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInlineDataRequiredWhe EXPECT_EQ(1u, cmd->getEmitInlineParameter()); const uint32_t inlineDataSize = sizeof(InlineData); - size_t expectedSizeIOH = dispatchInterface->getCrossThreadDataSize() + - dispatchInterface->getPerThreadDataSizeForWholeThreadGroup() - - inlineDataSize; + size_t expectedSizeIOH = alignUp(dispatchInterface->getCrossThreadDataSize() + + dispatchInterface->getPerThreadDataSizeForWholeThreadGroup() - + inlineDataSize, + this->getHelper().getIOHAlignment()); auto heap = cmdContainer->getIndirectHeap(HeapType::indirectObject); EXPECT_EQ(expectedSizeIOH, heap->getUsed()); } @@ -619,8 +620,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInlineDataRequiredIsF auto cmd = genCmdCast(*itor); EXPECT_EQ(0u, cmd->getEmitInlineParameter()); - size_t expectedSizeIOH = dispatchInterface->getCrossThreadDataSize() + - dispatchInterface->getPerThreadDataSizeForWholeThreadGroup(); + size_t expectedSizeIOH = alignUp(dispatchInterface->getCrossThreadDataSize() + + dispatchInterface->getPerThreadDataSizeForWholeThreadGroup(), + this->getHelper().getIOHAlignment()); auto heap = cmdContainer->getIndirectHeap(HeapType::indirectObject); EXPECT_EQ(expectedSizeIOH, heap->getUsed()); }