fix: correct alignment of per thread scratch size

Related-To: NEO-5288

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2023-07-11 15:55:01 +00:00 committed by Compute-Runtime-Automation
parent 40af0dddeb
commit 1c0285a156
5 changed files with 47 additions and 37 deletions

View File

@ -468,7 +468,7 @@ HWTEST_P(EnqueueKernelWithScratch, GivenKernelRequiringScratchWhenItIsEnqueuedWi
EXPECT_TRUE(mockCsr->isMadeResident(graphicsAllocation));
// Enqueue With ScratchSize bigger than previous
scratchSize = 8196;
scratchSize = 8192;
mockKernel.kernelInfo.setPerThreadScratchSize(scratchSize, 0);
enqueueKernel<FamilyType, false>(mockKernel);

View File

@ -950,7 +950,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenForced32BitAllocationsModeSto
auto scratchAllocation = commandStreamReceiver->getScratchAllocation();
ASSERT_NE(scratchAllocation, nullptr);
commandStreamReceiver->setRequiredScratchSizes(8196, 0); // whatever > first size
commandStreamReceiver->setRequiredScratchSizes(8192, 0); // whatever > first size
flushTask(*commandStreamReceiver); // 2nd flush
@ -984,7 +984,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenForced32BitAllocationsModeSto
auto scratchAllocation = commandStreamReceiver->getScratchAllocation();
ASSERT_NE(scratchAllocation, nullptr);
commandStreamReceiver->setRequiredScratchSizes(8196, 0); // whatever > first size
commandStreamReceiver->setRequiredScratchSizes(8192, 0); // whatever > first size
flushTask(*commandStreamReceiver); // 2nd flush

View File

@ -215,19 +215,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPAndLater, givenScrat
EXPECT_FALSE(cfeStateDirty);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPAndLater, givenScratchSpaceSurfaceStateEnabledWhenRequiredScratchSpaceIsSetThenPerThreadScratchSizeIsAlignedTo64) {
auto commandStreamReceiver = std::make_unique<MockCsrHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
auto scratchController = static_cast<MockScratchSpaceControllerXeHPAndLater *>(commandStreamReceiver->getScratchSpaceController());
uint32_t perThreadScratchSize = 1;
uint32_t expectedValue = 1 << 6;
bool stateBaseAddressDirty = false;
bool cfeStateDirty = false;
uint8_t surfaceHeap[1000];
scratchController->setRequiredScratchSpace(surfaceHeap, 0u, perThreadScratchSize, 0u, commandStreamReceiver->taskCount, *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty);
EXPECT_EQ(expectedValue, scratchController->perThreadScratchSize);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPAndLater, givenScratchSpaceSurfaceStateEnabledWhenNewSshProvidedAndScratchAllocationExistsThenSetDirtyBitCopyCurrentState) {
using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
auto commandStreamReceiver = std::make_unique<MockCsrHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
@ -584,25 +571,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPAndLater, givenScrat
EXPECT_EQ(32u, scratchController->stateSlotsCount);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPAndLater, givenScratchSpaceSurfaceStateEnabledWhenSizeForPrivateScratchSpaceIsMisalignedThenAlignItTo64) {
using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
DebugManagerStateRestore restorer;
DebugManager.flags.EnablePrivateScratchSlot1.set(1);
RENDER_SURFACE_STATE surfaceState[4];
MockCsrHw<FamilyType> commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
auto scratchController = static_cast<MockScratchSpaceControllerXeHPAndLater *>(commandStreamReceiver.getScratchSpaceController());
uint32_t misalignedSizeForPrivateScratch = MemoryConstants::pageSize + 1;
bool cfeStateDirty = false;
bool stateBaseAddressDirty = false;
scratchController->setRequiredScratchSpace(surfaceState, 0u, 0u, misalignedSizeForPrivateScratch, 0u,
*pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty);
EXPECT_NE(scratchController->privateScratchSizeBytes, misalignedSizeForPrivateScratch * scratchController->computeUnitsUsedForScratch);
EXPECT_EQ(scratchController->privateScratchSizeBytes, alignUp(misalignedSizeForPrivateScratch, 64) * scratchController->computeUnitsUsedForScratch);
EXPECT_EQ(scratchController->privateScratchSizeBytes, scratchController->getPrivateScratchSpaceAllocation()->getUnderlyingBufferSize());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPAndLater, givenDisabledPrivateScratchSpaceWhenSizeForPrivateScratchSpaceIsProvidedThenItIsNotCreated) {
using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
DebugManagerStateRestore restorer;

View File

@ -13,6 +13,7 @@
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/memory_manager/allocation_properties.h"
@ -158,7 +159,10 @@ void ScratchSpaceControllerXeHPAndLater::prepareScratchAllocation(uint32_t requi
bool &stateBaseAddressDirty,
bool &scratchSurfaceDirty,
bool &vfeStateDirty) {
uint32_t requiredPerThreadScratchSizeAlignedUp = alignUp(requiredPerThreadScratchSize, 64);
uint32_t requiredPerThreadScratchSizeAlignedUp = requiredPerThreadScratchSize;
if (!Math::isPow2(requiredPerThreadScratchSizeAlignedUp)) {
requiredPerThreadScratchSizeAlignedUp = Math::nextPowerOfTwo(requiredPerThreadScratchSize);
}
size_t requiredScratchSizeInBytes = static_cast<size_t>(requiredPerThreadScratchSizeAlignedUp) * computeUnitsUsedForScratch;
scratchSurfaceDirty = false;
auto multiTileCapable = osContext.getNumSupportedDevices() > 1;
@ -174,7 +178,10 @@ void ScratchSpaceControllerXeHPAndLater::prepareScratchAllocation(uint32_t requi
scratchAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(properties);
}
if (privateScratchSpaceSupported) {
uint32_t requiredPerThreadPrivateScratchSizeAlignedUp = alignUp(requiredPerThreadPrivateScratchSize, 64);
uint32_t requiredPerThreadPrivateScratchSizeAlignedUp = requiredPerThreadPrivateScratchSize;
if (!Math::isPow2(requiredPerThreadPrivateScratchSizeAlignedUp)) {
requiredPerThreadPrivateScratchSizeAlignedUp = Math::nextPowerOfTwo(requiredPerThreadPrivateScratchSize);
}
size_t requiredPrivateScratchSizeInBytes = static_cast<size_t>(requiredPerThreadPrivateScratchSizeAlignedUp) * computeUnitsUsedForScratch;
if (privateScratchSizeBytes < requiredPrivateScratchSizeInBytes) {
if (privateScratchAllocation) {

View File

@ -15,6 +15,7 @@
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/gmm_helper/page_table_mngr.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/surface.h"
@ -41,6 +42,7 @@
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/mocks/mock_internal_allocation_storage.h"
#include "shared/test/common/mocks/mock_memory_manager.h"
#include "shared/test/common/mocks/mock_scratch_space_controller_xehp_and_later.h"
#include "shared/test/common/mocks/mock_timestamp_container.h"
#include "shared/test/common/mocks/ult_device_factory.h"
#include "shared/test/common/test_macros/hw_test.h"
@ -4174,3 +4176,36 @@ HWTEST2_F(CommandStreamReceiverHwTest,
EXPECT_TRUE(commandStreamReceiver.isMadeResident(sipAllocation));
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTest, givenScratchSpaceSurfaceStateEnabledWhenRequiredScratchSpaceIsSetThenPerThreadScratchSizeIsAlignedNextPow2) {
auto commandStreamReceiver = std::make_unique<MockCsrHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
auto scratchController = static_cast<MockScratchSpaceControllerXeHPAndLater *>(commandStreamReceiver->getScratchSpaceController());
uint32_t perThreadScratchSize = 65;
uint32_t expectedValue = Math::nextPowerOfTwo(perThreadScratchSize);
bool stateBaseAddressDirty = false;
bool cfeStateDirty = false;
uint8_t surfaceHeap[1000];
scratchController->setRequiredScratchSpace(surfaceHeap, 0u, perThreadScratchSize, 0u, commandStreamReceiver->taskCount, *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty);
EXPECT_EQ(expectedValue, scratchController->perThreadScratchSize);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTest, givenScratchSpaceSurfaceStateEnabledWhenSizeForPrivateScratchSpaceIsMisalignedThenAlignItNextPow2) {
using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
DebugManagerStateRestore restorer;
DebugManager.flags.EnablePrivateScratchSlot1.set(1);
RENDER_SURFACE_STATE surfaceState[4];
MockCsrHw<FamilyType> commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
auto scratchController = static_cast<MockScratchSpaceControllerXeHPAndLater *>(commandStreamReceiver.getScratchSpaceController());
uint32_t misalignedSizeForPrivateScratch = MemoryConstants::pageSize + 1;
uint32_t alignedSizeForPrivateScratch = Math::nextPowerOfTwo(misalignedSizeForPrivateScratch);
bool cfeStateDirty = false;
bool stateBaseAddressDirty = false;
scratchController->setRequiredScratchSpace(surfaceState, 0u, 0u, misalignedSizeForPrivateScratch, 0u,
*pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty);
EXPECT_NE(scratchController->privateScratchSizeBytes, misalignedSizeForPrivateScratch * scratchController->computeUnitsUsedForScratch);
EXPECT_EQ(scratchController->privateScratchSizeBytes, alignedSizeForPrivateScratch * scratchController->computeUnitsUsedForScratch);
EXPECT_EQ(scratchController->privateScratchSizeBytes, scratchController->getPrivateScratchSpaceAllocation()->getUnderlyingBufferSize());
}