fix: correct region barrier size calculation

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2024-06-21 11:16:36 +00:00
committed by Compute-Runtime-Automation
parent 1854bc4a60
commit 8698e7fb43
4 changed files with 36 additions and 2 deletions

View File

@ -2768,7 +2768,16 @@ void CommandListCoreFamily<gfxCoreFamily>::programRegionGroupBarrier(Kernel &ker
auto neoDevice = device->getNEODevice();
neoDevice->allocateSyncBufferHandler();
auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(MemoryConstants::cacheLineSize);
auto &gtSysInfo = device->getNEODevice()->getHardwareInfo().gtSystemInfo;
auto tileCount = std::max(gtSysInfo.MultiTileArchInfo.TileCount, uint8_t(1)); // Use physical count
constexpr size_t barrierSizePerSubslice = sizeof(uint64_t);
size_t size = alignUp(tileCount * gtSysInfo.MaxSubSlicesSupported * barrierSizePerSubslice, MemoryConstants::cacheLineSize);
auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(size);
kernel.patchRegionGroupBarrier(patchData.first, patchData.second);
}

View File

@ -219,6 +219,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::partitionCount;
using BaseClass::pipeControlMultiKernelEventSync;
using BaseClass::pipelineSelectStateTracking;
using BaseClass::programRegionGroupBarrier;
using BaseClass::requiredStreamState;
using BaseClass::requiresQueueUncachedMocs;
using BaseClass::signalAllEventPackets;

View File

@ -13,6 +13,7 @@
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/program/sync_buffer_handler.h"
#include "shared/test/common/helpers/engine_descriptor_helper.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
@ -46,6 +47,27 @@ HWTEST2_F(MultiTileImmediateCommandListTest, GivenMultiTileDeviceWhenCreatingImm
EXPECT_EQ(2u, commandList->partitionCount);
}
// Checks how much sync-buffer space programRegionGroupBarrier() reserves when the tile count
// reported in hardware info (3) differs from the number of bits set in the device bitfield (2).
// The reserved size is observed indirectly: the next allocation from the sync buffer handler
// is expected to begin right after the barrier area.
HWTEST2_F(MultiTileImmediateCommandListTest, givenMultipleTilesWhenAllocatingBarrierSyncBufferThenEnsureCorrectSize, IsAtLeastXeHpCore) {
    constexpr uint8_t reportedTileCount = 3;

    // Precondition: the device under test exposes two tiles through its bitfield.
    EXPECT_EQ(2u, device->getNEODevice()->getDeviceBitfield().count());

    // Override the tile count stored in the hardware info of root device environment 0.
    auto mutableHwInfo = neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->getMutableHardwareInfo();
    mutableHwInfo->gtSystemInfo.MultiTileArchInfo.TileCount = reportedTileCount;

    Mock<KernelImp> mockKernel;

    // programRegionGroupBarrier is only reachable through the WhiteBox wrapper.
    auto immediateCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(static_cast<L0::CommandListImp *>(commandList.get()));
    auto exposedCmdList = static_cast<WhiteBox<::L0::CommandListCoreFamilyImmediate<gfxCoreFamily>> *>(immediateCmdList);
    exposedCmdList->programRegionGroupBarrier(mockKernel);

    // Request one more byte; the returned offset shows how much the barrier call consumed.
    auto followUpPatchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(1);

    // One uint64_t per supported subslice per reported tile, rounded up to a cache line.
    auto &hwInfo = device->getNEODevice()->getHardwareInfo();
    size_t expectedOffset = alignUp(reportedTileCount * hwInfo.gtSystemInfo.MaxSubSlicesSupported * sizeof(uint64_t), MemoryConstants::cacheLineSize);

    EXPECT_EQ(followUpPatchData.second, expectedOffset);
}
using MultiTileImmediateInternalCommandListTest = Test<MultiTileCommandListFixture<true, true, false, -1>>;
HWTEST2_F(MultiTileImmediateInternalCommandListTest, GivenMultiTileDeviceWhenCreatingInternalImmediateCommandListThenExpectPartitionCountEqualOne, IsWithinXeGfxFamily) {

View File

@ -532,7 +532,9 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenA
auto patchPtr2 = *reinterpret_cast<uint64_t *>(ptrOffset(kernel.crossThreadData.get(), regionGroupBarrier.stateless));
EXPECT_EQ(patchPtr2, patchPtr + MemoryConstants::cacheLineSize);
auto offset = alignUp(device->getHwInfo().gtSystemInfo.MaxSubSlicesSupported * sizeof(uint64_t), MemoryConstants::cacheLineSize);
EXPECT_EQ(patchPtr2, patchPtr + offset);
}
HWTEST2_F(CommandListAppendLaunchKernel, whenAppendLaunchCooperativeKernelAndQueryKernelTimestampsToTheSameCmdlistThenFronEndStateIsNotChanged, IsAtLeastSkl) {