From 60495a5b201e7f1f91a6e835de1433dc1ca3be14 Mon Sep 17 00:00:00 2001
From: Mateusz Hoppe
Date: Tue, 14 Jan 2025 13:51:12 +0000
Subject: [PATCH] performance: optimize memory used for scratch programming

- reserve exact number of slots for scratch surface states in surface state heaps
- do not use offsets for contexts depending on engine type executing cmdlists

Signed-off-by: Mateusz Hoppe
---
 .../core/source/cmdlist/cmdlist_hw_xehp_and_later.inl | 8 +++++++-
 .../source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl | 9 ++-------
 .../sources/cmdlist/test_cmdlist_xehp_and_later.cpp   | 4 ++--
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
index 8d32f32ab6..ee82e03aa0 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
@@ -36,7 +36,13 @@ namespace L0 {
 
 template
 size_t CommandListCoreFamily::getReserveSshSize() {
-    return 4 * MemoryConstants::pageSize;
+    constexpr size_t maxPtssSteps = 16;
+    constexpr size_t numSlotsPerStep = 2;
+    constexpr size_t numSteps = 2;
+    constexpr size_t startSlotIndex = 1;
+
+    using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE;
+    return (maxPtssSteps * numSlotsPerStep + startSlotIndex) * numSteps * sizeof(RENDER_SURFACE_STATE);
 }
 
 template
diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl
index f5fd0b12b7..e167d47e38 100644
--- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl
+++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021-2024 Intel Corporation
+ * Copyright (C) 2021-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -142,13 +142,8 @@ void CommandQueueHw::handleScratchSpace(NEO::HeapContainer &sshHe
                                         csr->getOsContext(), gsbaState, frontEndState);
     }
 
-    NEO::Device *neoDevice = device->getNEODevice();
-    auto &gfxCoreHelper = neoDevice->getGfxCoreHelper();
-    auto &productHelper = neoDevice->getProductHelper();
-
     if (sshHeaps.size() > 0) {
-        uint32_t offsetIndex = gfxCoreHelper.getMaxPtssIndex(productHelper) * csr->getOsContext().getEngineType() + 1u;
-        scratchController->programHeaps(sshHeaps, offsetIndex, perThreadScratchSpaceSlot0Size, perThreadScratchSpaceSlot1Size,
+        scratchController->programHeaps(sshHeaps, 1u, perThreadScratchSpaceSlot0Size, perThreadScratchSpaceSlot1Size,
                                         csr->getOsContext(), gsbaState, frontEndState);
     }
 
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp
index 1bd695c385..210c3b37fc 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp
@@ -204,11 +204,11 @@ HWTEST2_F(CommandListTests, whenCommandListIsCreatedAndProgramExtendedPipeContro
 }
 
 using CommandListTestsReserveSize = Test;
-HWTEST2_F(CommandListTestsReserveSize, givenCommandListWhenGetReserveSshSizeThen4PagesReturned, IsAtLeastXeHpCore) {
+HWTEST2_F(CommandListTestsReserveSize, givenCommandListWhenGetReserveSshSizeThen16slotSpaceReturned, IsAtLeastXeHpCore) {
     L0::CommandListCoreFamily commandList(1u);
     commandList.initialize(device, NEO::EngineGroupType::compute, 0u);
 
-    EXPECT_EQ(commandList.getReserveSshSize(), 4 * MemoryConstants::pageSize);
+    EXPECT_EQ(commandList.getReserveSshSize(), (16 * 2 + 1) * 2 * sizeof(typename FamilyType::RENDER_SURFACE_STATE));
 }
 
 using CommandListAppendLaunchKernel = Test;