performance: Do not create global fence allocation on integrated

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2025-04-07 10:16:55 +00:00
committed by Compute-Runtime-Automation
parent d5f39fce5e
commit 8978ea5e5a
23 changed files with 151 additions and 66 deletions

View File

@@ -12,7 +12,9 @@
#include "shared/source/command_stream/preemption_mode.h"
#include "shared/source/command_stream/thread_arbitration_policy.h"
#include "shared/source/debugger/debugger.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/definitions/command_encoder_args.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/kernel/kernel_arg_descriptor.h"
#include "shared/source/kernel/kernel_execution_type.h"
@@ -93,7 +95,7 @@ struct EncodeDispatchKernelArgs {
bool isFlushL3AfterPostSyncForHostUsmRequired = false;
bool requiresSystemMemoryFence() const {
return (isHostScopeSignalEvent && isKernelUsingSystemAllocation);
return (isHostScopeSignalEvent && isKernelUsingSystemAllocation && !device->getHardwareInfo().capabilityTable.isIntegratedDevice);
}
};
@@ -279,7 +281,7 @@ struct EncodeDispatchKernel : public EncodeDispatchKernelBase<GfxFamily> {
template <typename WalkerType, typename InterfaceDescriptorType>
static void overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor);
template <typename WalkerType>
static void encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
static void encodeWalkerPostSyncFields(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs);
template <typename WalkerType, typename InterfaceDescriptorType>
static void encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs);
};

View File

@@ -31,7 +31,7 @@ template void NEO::EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlush
template void NEO::EncodeDispatchKernel<Family>::setWalkerRegionSettings<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount,
uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder);
template void NEO::EncodeDispatchKernel<Family>::overrideDefaultValues<Family::DefaultWalkerType, Family::DefaultWalkerType::InterfaceDescriptorType>(Family::DefaultWalkerType &walkerCmd, Family::DefaultWalkerType::InterfaceDescriptorType &interfaceDescriptor);
template void NEO::EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void NEO::EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs);
template void NEO::EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker<Family::DefaultWalkerType, Family::DefaultWalkerType::InterfaceDescriptorType>(Family::DefaultWalkerType &walkerCmd, const Family::DefaultWalkerType::InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs);
template struct NEO::EncodeStates<Family>;

View File

@@ -415,11 +415,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
.requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
.localRegionSize = args.localRegionSize,
.maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
.requiredSystemFence = args.requiresSystemMemoryFence(),
.requiredSystemFence = args.requiresSystemMemoryFence() && args.device->getGfxCoreHelper().isFenceAllocationRequired(hwInfo),
.hasSample = kernelDescriptor.kernelAttributes.flags.hasSample};
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(walkerCmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(walkerCmd, rootDeviceEnvironment, walkerArgs);
EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(walkerCmd, &idd, rootDeviceEnvironment, walkerArgs);
EncodeDispatchKernel<Family>::overrideDefaultValues(walkerCmd, idd);
@@ -1176,8 +1176,8 @@ void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptor
template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
auto programGlobalFenceAsPostSyncOperationInComputeWalker = walkerArgs.requiredSystemFence;
void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {
auto programGlobalFenceAsPostSyncOperationInComputeWalker = !rootDeviceEnvironment.getHardwareInfo()->capabilityTable.isIntegratedDevice && walkerArgs.requiredSystemFence;
int32_t overrideProgramSystemMemoryFence = debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get();
if (overrideProgramSystemMemoryFence != -1) {
programGlobalFenceAsPostSyncOperationInComputeWalker = !!overrideProgramSystemMemoryFence;

View File

@@ -303,7 +303,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, cmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(cmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(cmd, rootDeviceEnvironment, walkerArgs);
EncodeDispatchKernel<Family>::template encodeComputeDispatchAllWalker<WalkerType, INTERFACE_DESCRIPTOR_DATA>(cmd, nullptr, rootDeviceEnvironment, walkerArgs);
memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
@@ -434,7 +434,7 @@ inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const Roo
template <typename Family>
template <typename WalkerType>
inline void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
inline void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {}
template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>

View File

@@ -12,12 +12,19 @@ namespace NEO {
template <typename Family>
bool GfxCoreHelperHw<Family>::isFenceAllocationRequired(const HardwareInfo &hwInfo) const {
if ((debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.get() == 1) ||
(debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get() == 1) ||
(debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.get() == 1) ||
(debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 1)) {
return true;
}
if ((debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.get() == 0) &&
(debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get() == 0) &&
(debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.get() == 0)) {
(debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.get() == 0) &&
(debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 0)) {
return false;
}
return true;
return !hwInfo.capabilityTable.isIntegratedDevice;
}
template <typename Family>

View File

@@ -59,7 +59,7 @@ void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT
template <>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {}
template <>
template <typename WalkerType, typename InterfaceDescriptorType>

View File

@@ -4472,6 +4472,9 @@ HWTEST2_F(CommandStreamReceiverHwTest,
givenImmediateFlushTaskWhenOneTimeContextSystemFenceRequiredThenExpectOneTimeSystemFenceCommand,
IsHeapfulSupportedAndAtLeastXeHpcCore) {
using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
if (pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) {
GTEST_SKIP();
}
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
commandStreamReceiver.storeMakeResidentAllocations = true;

View File

@@ -199,7 +199,7 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenDebugFlagSetToTrueWhenCrea
DebugManagerStateRestore restorer;
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(1);
if (heaplessStateInit) {
if (heaplessStateInit || pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) {
GTEST_SKIP();
}

View File

@@ -17,6 +17,7 @@
#include "shared/source/os_interface/windows/wddm_residency_controller.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/memory_management.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/helpers/variable_backup.h"
#include "shared/test/common/mocks/mock_io_functions.h"
@@ -86,7 +87,6 @@ using WddmDirectSubmissionWithMockGdiDllTest = Test<WddmDirectSubmissionWithMock
HWTEST_F(WddmDirectSubmissionTest, givenWddmWhenDirectIsInitializedAndStartedThenExpectProperCommandsDispatched) {
DebugManagerStateRestore restorer;
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0);
std::unique_ptr<MockWddmDirectSubmission<FamilyType, RenderDispatcher<FamilyType>>> wddmDirectSubmission =
std::make_unique<MockWddmDirectSubmission<FamilyType, RenderDispatcher<FamilyType>>>(*device->getDefaultEngine().commandStreamReceiver);
@@ -120,9 +120,15 @@ HWTEST_F(WddmDirectSubmissionTest, givenWddmWhenDirectIsInitializedAndStartedThe
EXPECT_EQ(1u, wddmMockInterface->destroyMonitorFenceCalled);
}
HWTEST_F(WddmDirectSubmissionTest, givenWddmWhenDirectIsInitializedWithMiMemFenceSupportedThenMakeGlobalFenceResident) {
struct WddmDirectSubmissionGlobalFenceTest : public WddmDirectSubmissionTest {
void SetUp() override {
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(1);
WddmDirectSubmissionTest::SetUp();
}
DebugManagerStateRestore restorer;
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(1);
};
HWTEST_F(WddmDirectSubmissionGlobalFenceTest, givenWddmWhenDirectIsInitializedWithMiMemFenceSupportedThenMakeGlobalFenceResident) {
std::unique_ptr<MockWddmDirectSubmission<FamilyType, RenderDispatcher<FamilyType>>> wddmDirectSubmission =
std::make_unique<MockWddmDirectSubmission<FamilyType, RenderDispatcher<FamilyType>>>(*device->getDefaultEngine().commandStreamReceiver);

View File

@@ -484,26 +484,37 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenGfxCoreHelperWhenAskedIfFe
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(-1);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(-1);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(-1);
EXPECT_EQ(gfxCoreHelper.isFenceAllocationRequired(hwInfo), !hwInfo.capabilityTable.isIntegratedDevice);
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(0);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0);
EXPECT_FALSE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(1);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(0);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(1);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(0);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(1);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(0);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(1);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
}

View File

@@ -438,7 +438,7 @@ XE2_HPG_CORETEST_F(EncodeKernelXe2HpgCoreTest, givenDefaultSettingForFenceWhenKe
auto walkerCmd = genCmdCast<DefaultWalkerType *>(*itor);
auto &postSyncData = walkerCmd->getPostSync();
EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest());
EXPECT_EQ(postSyncData.getSystemMemoryFenceRequest(), !pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice);
}
XE2_HPG_CORETEST_F(EncodeKernelXe2HpgCoreTest, givenCleanHeapsAndSlmNotChangedAndUncachedMocsRequestedThenSBAIsProgrammedAndMocsAreSet) {

View File

@@ -486,26 +486,37 @@ XE3_CORETEST_F(GfxCoreHelperTestsXe3Core, givenGfxCoreHelperWhenAskedIfFenceAllo
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(-1);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(-1);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(-1);
EXPECT_EQ(gfxCoreHelper.isFenceAllocationRequired(hwInfo), !hwInfo.capabilityTable.isIntegratedDevice);
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(0);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0);
EXPECT_FALSE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(1);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(0);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(1);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(0);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(1);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);
debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(0);
debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(1);
EXPECT_TRUE(gfxCoreHelper.isFenceAllocationRequired(hwInfo));
}

View File

@@ -379,7 +379,7 @@ XE3_CORETEST_F(EncodeKernelXe3CoreTest, givenDefaultSettingForFenceWhenKernelUse
auto walkerCmd = genCmdCast<DefaultWalkerType *>(*itor);
auto &postSyncData = walkerCmd->getPostSync();
EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest());
EXPECT_EQ(postSyncData.getSystemMemoryFenceRequest(), !pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice);
}
XE3_CORETEST_F(EncodeKernelXe3CoreTest, givenDebugFlagSetWhenSetPropertiesAllCalledThenDisablePipelinedThreadArbitrationPolicy) {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2024 Intel Corporation
* Copyright (C) 2022-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -56,7 +56,7 @@ PVCTEST_F(WalkerDispatchTestsPvc, givenPvcWhenEncodeAdditionalWalkerFieldsThenPo
testInput.programGlobalFenceAsPostSyncOperationInComputeWalker);
postSyncData.setSystemMemoryFenceRequest(false);
EncodeDispatchKernel<FamilyType>::encodeWalkerPostSyncFields(walkerCmd, walkerArgs);
EncodeDispatchKernel<FamilyType>::encodeWalkerPostSyncFields(walkerCmd, rootDeviceEnvironment, walkerArgs);
EXPECT_EQ(testInput.expectSystemMemoryFenceRequest, postSyncData.getSystemMemoryFenceRequest());
}
}
@@ -78,7 +78,7 @@ PVCTEST_F(WalkerDispatchTestsPvc, givenPvcSupportsSystemMemoryFenceWhenNoSystemF
hwInfo.platform.usDeviceID = deviceId;
postSyncData.setSystemMemoryFenceRequest(true);
EncodeDispatchKernel<FamilyType>::encodeWalkerPostSyncFields(walkerCmd, walkerArgs);
EncodeDispatchKernel<FamilyType>::encodeWalkerPostSyncFields(walkerCmd, rootDeviceEnvironment, walkerArgs);
EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest());
}
}