performance: Remove global fence allocation from BMG

Resolves: NEO-14642

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk 2025-04-28 15:54:56 +00:00 committed by Compute-Runtime-Automation
parent 304fba1eba
commit c9a21c158e
6 changed files with 12 additions and 166 deletions

View File

@ -21,41 +21,6 @@ namespace ult {
using CommandQueueCommandsXe2HpgCore = Test<DeviceFixture>;
HWTEST2_F(CommandQueueCommandsXe2HpgCore, givenCommandQueueWhenExecutingCommandListsThenStateSystemMemFenceAddressCmdIsGenerated, IsXe2HpgCore) {
if (neoDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) {
GTEST_SKIP();
}
using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
ze_command_queue_desc_t desc = {};
auto csr = neoDevice->getDefaultEngine().commandStreamReceiver;
auto commandQueue = new MockCommandQueueHw<FamilyType::gfxCoreFamily>(device, csr, &desc);
commandQueue->initialize(false, false, false);
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, returnValue, false));
auto commandListHandle = commandList->toHandle();
commandList->close();
commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
auto globalFence = csr->getGlobalFenceAllocation();
auto used = commandQueue->commandStream.getUsed();
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList, commandQueue->commandStream.getCpuBase(), used));
auto itor = find<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itor);
auto systemMemFenceAddressCmd = genCmdCast<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(*itor);
EXPECT_EQ(globalFence->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress());
commandQueue->destroy();
}
HWTEST2_F(CommandQueueCommandsXe2HpgCore, givenCommandQueueWhenExecutingCommandListsForTheSecondTimeThenStateSystemMemFenceAddressCmdIsNotGenerated, IsXe2HpgCore) {
using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
ze_command_queue_desc_t desc = {};

View File

@ -375,133 +375,6 @@ struct SystemMemoryFenceInDefaultConfigurationTest : public UltCommandStreamRece
DebugManagerStateRestore restore;
};
using SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore = SystemMemoryFenceInDefaultConfigurationTest;
XE2_HPG_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore,
givenNoEventProvidedWhenEnqueueKernelNotUsingSystemMemoryThenNoPostSyncFenceRequestDispatched) {
using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE;
if (pClDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) {
GTEST_SKIP();
}
MockKernelWithInternals kernel(*pClDevice);
MockContext context(pClDevice);
MockCommandQueueHw<FamilyType> commandQueue(&context, pClDevice, nullptr);
auto &commandStreamReceiver = pClDevice->getUltCommandStreamReceiver<FamilyType>();
size_t globalWorkSize[3] = {1, 1, 1};
commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr);
ClHardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandQueue);
auto itorSystemMemFenceAddress = find<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress);
auto systemMemFenceAddressCmd = genCmdCast<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(*itorSystemMemFenceAddress);
EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress());
auto itorComputeWalker = find<COMPUTE_WALKER *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker);
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*itorComputeWalker);
auto &postSyncData = walkerCmd->getPostSync();
EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest());
if (MemorySynchronizationCommands<FamilyType>::getSizeForAdditonalSynchronization(pClDevice->getRootDeviceEnvironment()) > 0) {
auto itorMiMemFence = find<MI_MEM_FENCE *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence);
auto fenceCmd = genCmdCast<MI_MEM_FENCE *>(*itorMiMemFence);
ASSERT_NE(nullptr, fenceCmd);
EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE_FENCE, fenceCmd->getFenceType());
}
}
XE2_HPG_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore,
givenNoEventProvidedWhenEnqueueKernelUsingSystemMemoryThenPostSyncFenceRequestNotDispatched) {
using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE;
if (pClDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) {
GTEST_SKIP();
}
MockKernelWithInternals kernel(*pClDevice);
MockContext context(pClDevice);
MockCommandQueueHw<FamilyType> commandQueue(&context, pClDevice, nullptr);
auto &commandStreamReceiver = pClDevice->getUltCommandStreamReceiver<FamilyType>();
size_t globalWorkSize[3] = {1, 1, 1};
kernel.mockKernel->anyKernelArgumentUsingSystemMemory = true;
commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr);
ClHardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandQueue);
auto itorSystemMemFenceAddress = find<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress);
auto systemMemFenceAddressCmd = genCmdCast<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(*itorSystemMemFenceAddress);
EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress());
auto itorComputeWalker = find<COMPUTE_WALKER *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker);
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*itorComputeWalker);
auto &postSyncData = walkerCmd->getPostSync();
EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest());
if (MemorySynchronizationCommands<FamilyType>::getSizeForAdditonalSynchronization(pClDevice->getRootDeviceEnvironment()) > 0) {
auto itorMiMemFence = find<MI_MEM_FENCE *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence);
auto fenceCmd = genCmdCast<MI_MEM_FENCE *>(*itorMiMemFence);
ASSERT_NE(nullptr, fenceCmd);
EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE_FENCE, fenceCmd->getFenceType());
}
}
XE2_HPG_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore,
givenEventProvidedWhenEnqueueKernelNotUsingSystemMemoryThenPostSyncFenceRequestNotDispatched) {
using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE;
if (pClDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) {
GTEST_SKIP();
}
MockKernelWithInternals kernel(*pClDevice);
MockContext context(pClDevice);
MockCommandQueueHw<FamilyType> commandQueue(&context, pClDevice, nullptr);
auto &commandStreamReceiver = pClDevice->getUltCommandStreamReceiver<FamilyType>();
size_t globalWorkSize[3] = {1, 1, 1};
cl_event kernelEvent{};
commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, &kernelEvent);
ClHardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandQueue);
auto itorSystemMemFenceAddress = find<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress);
auto systemMemFenceAddressCmd = genCmdCast<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(*itorSystemMemFenceAddress);
EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress());
auto itorComputeWalker = find<COMPUTE_WALKER *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker);
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*itorComputeWalker);
auto &postSyncData = walkerCmd->getPostSync();
EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest());
if (MemorySynchronizationCommands<FamilyType>::getSizeForAdditonalSynchronization(pClDevice->getRootDeviceEnvironment()) > 0) {
auto itorMiMemFence = find<MI_MEM_FENCE *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence);
auto fenceCmd = genCmdCast<MI_MEM_FENCE *>(*itorMiMemFence);
ASSERT_NE(nullptr, fenceCmd);
EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE_FENCE, fenceCmd->getFenceType());
}
auto event = castToObject<Event>(kernelEvent);
event->release();
}
struct Xe2MidThreadCommandStreamReceiverTest : public UltCommandStreamReceiverTest {
void SetUp() override {
debugManager.flags.ForcePreemptionMode.set(static_cast<int32_t>(PreemptionMode::MidThread));

View File

@ -24,7 +24,9 @@ bool GfxCoreHelperHw<Family>::isFenceAllocationRequired(const HardwareInfo &hwIn
(debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 0)) {
return false;
}
return !hwInfo.capabilityTable.isIntegratedDevice;
return productHelper.isGlobalFenceInCommandStreamRequired(hwInfo) ||
productHelper.isGlobalFenceInDirectSubmissionRequired(hwInfo) ||
productHelper.isGlobalFenceInPostSyncRequired(hwInfo);
}
template <typename Family>

View File

@ -4472,7 +4472,7 @@ HWTEST2_F(CommandStreamReceiverHwTest,
givenImmediateFlushTaskWhenOneTimeContextSystemFenceRequiredThenExpectOneTimeSystemFenceCommand,
IsHeapfulSupportedAndAtLeastXeHpcCore) {
using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
if (pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) {
if (!pDevice->getGfxCoreHelper().isFenceAllocationRequired(pDevice->getHardwareInfo(), pDevice->getProductHelper())) {
GTEST_SKIP();
}

View File

@ -219,10 +219,14 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenDebugFlagSetToTrueWhenCrea
DebugManagerStateRestore restorer;
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(1);
if (heaplessStateInit || pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) {
if (heaplessStateInit) {
GTEST_SKIP();
}
if (!pDevice->getDefaultEngine().commandStreamReceiver->getGlobalFenceAllocation()) {
pDevice->getDefaultEngine().commandStreamReceiver->createGlobalFenceAllocation();
}
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);
EXPECT_TRUE(directSubmission.miMemFenceRequired);
@ -232,6 +236,8 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenDebugFlagSetToTrueWhenCrea
EXPECT_EQ(directSubmission.systemMemoryFenceAddressSet, directSubmission.globalFenceAllocation != nullptr);
EXPECT_TRUE(directSubmission.miMemFenceRequired);
reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(pDevice->getDefaultEngine().commandStreamReceiver)->cleanupResources();
}
HWCMDTEST_F(IGFX_XE_HP_CORE, DirectSubmissionDispatchBufferTest,

View File

@ -486,7 +486,7 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenGfxCoreHelperWhenAskedIfFe
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(-1);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(-1);
EXPECT_EQ(gfxCoreHelper.isFenceAllocationRequired(hwInfo, productHelper), !hwInfo.capabilityTable.isIntegratedDevice);
EXPECT_FALSE(gfxCoreHelper.isFenceAllocationRequired(hwInfo, productHelper));
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);