refactor: change additional walker fields encoder 4/n

- move post sync system fence into dedicated encoder

Related-To: NEO-12639

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2024-10-31 09:06:05 +00:00 committed by Compute-Runtime-Automation
parent 64061b623b
commit 32fd00e150
10 changed files with 38 additions and 60 deletions

View File

@ -148,6 +148,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), requiredSystemFence, kernelInfo.kernelDescriptor, kernelAttributes.walkOrder, kernelAttributes.additionalSize, maxFrontEndThreads}; EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), requiredSystemFence, kernelInfo.kernelDescriptor, kernelAttributes.walkOrder, kernelAttributes.additionalSize, maxFrontEndThreads};
EncodeDispatchKernel<GfxFamily>::template encodeAdditionalWalkerFields<WalkerType>(rootDeviceEnvironment, walkerCmd, encodeWalkerArgs); EncodeDispatchKernel<GfxFamily>::template encodeAdditionalWalkerFields<WalkerType>(rootDeviceEnvironment, walkerCmd, encodeWalkerArgs);
EncodeDispatchKernel<GfxFamily>::template encodeWalkerPostSyncFields<WalkerType>(walkerCmd, encodeWalkerArgs);
EncodeDispatchKernel<GfxFamily>::template overrideDefaultValues<WalkerType, InterfaceDescriptorType>(walkerCmd, *interfaceDescriptor); EncodeDispatchKernel<GfxFamily>::template overrideDefaultValues<WalkerType, InterfaceDescriptorType>(walkerCmd, *interfaceDescriptor);
auto devices = queueCsr.getOsContext().getDeviceBitfield(); auto devices = queueCsr.getOsContext().getDeviceBitfield();

View File

@ -229,6 +229,8 @@ struct EncodeDispatchKernel {
template <typename WalkerType, typename InterfaceDescriptorType> template <typename WalkerType, typename InterfaceDescriptorType>
static void overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor); static void overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor);
template <typename WalkerType>
static void encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
}; };
template <typename GfxFamily> template <typename GfxFamily>

View File

@ -278,6 +278,15 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension(); auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd); EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd);
EncodeWalkerArgs walkerArgs{
KernelExecutionType::defaultType,
args.requiresSystemMemoryFence(),
kernelDescriptor,
args.requiredDispatchWalkOrder,
args.additionalSizeParam,
args.device->getDeviceInfo().maxFrontEndThreads};
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, cmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(cmd, walkerArgs);
memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd)); memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
@ -404,6 +413,10 @@ template <typename Family>
template <typename WalkerType> template <typename WalkerType>
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {} inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
template <typename Family>
template <typename WalkerType>
inline void NEO::EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
template <typename Family> template <typename Family>
template <typename InterfaceDescriptorType> template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {} void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}

View File

@ -32,6 +32,7 @@ template void NEO::EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlush
template void NEO::EncodeDispatchKernel<Family>::setWalkerRegionSettings<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const HardwareInfo &hwInfo, uint32_t partitionCount, template void NEO::EncodeDispatchKernel<Family>::setWalkerRegionSettings<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const HardwareInfo &hwInfo, uint32_t partitionCount,
uint32_t workgroupSize, uint32_t maxWgCountPerTile, bool requiredWalkOrder); uint32_t workgroupSize, uint32_t maxWgCountPerTile, bool requiredWalkOrder);
template void NEO::EncodeDispatchKernel<Family>::overrideDefaultValues<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::DefaultWalkerType &walkerCmd, Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor); template void NEO::EncodeDispatchKernel<Family>::overrideDefaultValues<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::DefaultWalkerType &walkerCmd, Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor);
template void NEO::EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template struct NEO::EncodeStates<Family>; template struct NEO::EncodeStates<Family>;
template struct NEO::EncodeMath<Family>; template struct NEO::EncodeMath<Family>;

View File

@ -412,6 +412,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
args.additionalSizeParam, args.additionalSizeParam,
args.device->getDeviceInfo().maxFrontEndThreads}; args.device->getDeviceInfo().maxFrontEndThreads};
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs); EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(walkerCmd, walkerArgs);
EncodeDispatchKernel<Family>::overrideDefaultValues(walkerCmd, idd); EncodeDispatchKernel<Family>::overrideDefaultValues(walkerCmd, idd);
@ -1235,4 +1236,16 @@ void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptor
} }
} }
template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
auto programGlobalFenceAsPostSyncOperationInComputeWalker = walkerArgs.requiredSystemFence;
int32_t overrideProgramSystemMemoryFence = debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get();
if (overrideProgramSystemMemoryFence != -1) {
programGlobalFenceAsPostSyncOperationInComputeWalker = !!overrideProgramSystemMemoryFence;
}
auto &postSyncData = walkerCmd.getPostSync();
postSyncData.setSystemMemoryFenceRequest(programGlobalFenceAsPostSyncOperationInComputeWalker);
}
} // namespace NEO } // namespace NEO

View File

@ -232,14 +232,6 @@ void EncodeSurfaceState<Family>::disableCompressionFlags(R_SURFACE_STATE *surfac
template <> template <>
template <typename WalkerType> template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) { void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
auto programGlobalFenceAsPostSyncOperationInComputeWalker = walkerArgs.requiredSystemFence;
int32_t overrideProgramSystemMemoryFence = debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get();
if (overrideProgramSystemMemoryFence != -1) {
programGlobalFenceAsPostSyncOperationInComputeWalker = !!overrideProgramSystemMemoryFence;
}
auto &postSyncData = walkerCmd.getPostSync();
postSyncData.setSystemMemoryFenceRequest(programGlobalFenceAsPostSyncOperationInComputeWalker);
bool computeDispatchAllWalkerEnable = walkerArgs.kernelExecutionType == KernelExecutionType::concurrent; bool computeDispatchAllWalkerEnable = walkerArgs.kernelExecutionType == KernelExecutionType::concurrent;
int32_t overrideComputeDispatchAllWalkerEnable = debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.get(); int32_t overrideComputeDispatchAllWalkerEnable = debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.get();
if (overrideComputeDispatchAllWalkerEnable != -1) { if (overrideComputeDispatchAllWalkerEnable != -1) {

View File

@ -163,17 +163,6 @@ void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT
template <> template <>
template <typename WalkerType> template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) { void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
const auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
auto programGlobalFenceAsPostSyncOperationInComputeWalker = productHelper.isGlobalFenceInCommandStreamRequired(hwInfo) &&
walkerArgs.requiredSystemFence;
int32_t overrideProgramSystemMemoryFence = debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get();
if (overrideProgramSystemMemoryFence != -1) {
programGlobalFenceAsPostSyncOperationInComputeWalker = !!overrideProgramSystemMemoryFence;
}
auto &postSyncData = walkerCmd.getPostSync();
postSyncData.setSystemMemoryFenceRequest(programGlobalFenceAsPostSyncOperationInComputeWalker);
int32_t overrideDispatchAllWalkerEnableInComputeWalker = debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.get(); int32_t overrideDispatchAllWalkerEnableInComputeWalker = debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.get();
if (overrideDispatchAllWalkerEnableInComputeWalker != -1) { if (overrideDispatchAllWalkerEnableInComputeWalker != -1) {
walkerCmd.setComputeDispatchAllWalkerEnable(overrideDispatchAllWalkerEnableInComputeWalker); walkerCmd.setComputeDispatchAllWalkerEnable(overrideDispatchAllWalkerEnableInComputeWalker);

View File

@ -58,8 +58,11 @@ void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT
template <> template <>
template <typename WalkerType> template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) { void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
}
template <>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
template <> template <>
void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const RootDeviceEnvironment &rootDeviceEnvironment) { void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const RootDeviceEnvironment &rootDeviceEnvironment) {

View File

@ -27,7 +27,7 @@ PVCTEST_F(WalkerDispatchTestsPvc, givenPvcWhenEncodeAdditionalWalkerFieldsThenPo
int32_t programGlobalFenceAsPostSyncOperationInComputeWalker; int32_t programGlobalFenceAsPostSyncOperationInComputeWalker;
bool expectSystemMemoryFenceRequest; bool expectSystemMemoryFenceRequest;
} testInputs[] = { } testInputs[] = {
{0x0, -1, false}, {0x0, -1, true},
{0x3, -1, true}, {0x3, -1, true},
{0x0, 0, false}, {0x0, 0, false},
{0x3, 0, false}, {0x3, 0, false},
@ -53,7 +53,7 @@ PVCTEST_F(WalkerDispatchTestsPvc, givenPvcWhenEncodeAdditionalWalkerFieldsThenPo
testInput.programGlobalFenceAsPostSyncOperationInComputeWalker); testInput.programGlobalFenceAsPostSyncOperationInComputeWalker);
postSyncData.setSystemMemoryFenceRequest(false); postSyncData.setSystemMemoryFenceRequest(false);
EncodeDispatchKernel<FamilyType>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs); EncodeDispatchKernel<FamilyType>::encodeWalkerPostSyncFields(walkerCmd, walkerArgs);
EXPECT_EQ(testInput.expectSystemMemoryFenceRequest, postSyncData.getSystemMemoryFenceRequest()); EXPECT_EQ(testInput.expectSystemMemoryFenceRequest, postSyncData.getSystemMemoryFenceRequest());
} }
} }
@ -75,7 +75,7 @@ PVCTEST_F(WalkerDispatchTestsPvc, givenPvcSupportsSystemMemoryFenceWhenNoSystemF
hwInfo.platform.usDeviceID = deviceId; hwInfo.platform.usDeviceID = deviceId;
postSyncData.setSystemMemoryFenceRequest(true); postSyncData.setSystemMemoryFenceRequest(true);
EncodeDispatchKernel<FamilyType>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs); EncodeDispatchKernel<FamilyType>::encodeWalkerPostSyncFields(walkerCmd, walkerArgs);
EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest());
} }
} }

View File

@ -128,39 +128,3 @@ PVCTEST_F(CommandEncodeStatesTestPvc, GivenVariousSlmTotalSizesWhenSetPreferredS
verifyPreferredSlmValues<FamilyType>(valuesToTest, pDevice->getRootDeviceEnvironment()); verifyPreferredSlmValues<FamilyType>(valuesToTest, pDevice->getRootDeviceEnvironment());
} }
PVCTEST_F(EncodeKernelPvcTest, givenDefaultSettingForFenceAsPostSyncOperationInComputeWalkerWhenEnqueueKernelIsCalledThenDoNotGenerateFenceCommands) {
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE;
DebugManagerStateRestore restore;
debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1);
auto &hwInfo = *pDevice->getRootDeviceEnvironment().getMutableHardwareInfo();
auto &productHelper = pDevice->getProductHelper();
hwInfo.platform.usDeviceID = pvcXlDeviceIds[0];
VariableBackup<unsigned short> hwRevId{&hwInfo.platform.usRevId};
hwRevId = productHelper.getHwRevIdFromStepping(REVISION_A0, hwInfo);
uint32_t dims[] = {1, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
dispatchInterface->getCrossThreadDataSizeResult = 0u;
bool requiresUncachedMocs = false;
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
dispatchArgs.isKernelUsingSystemAllocation = true;
dispatchArgs.isHostScopeSignalEvent = true;
EncodeDispatchKernel<FamilyType>::template encode<DefaultWalkerType>(*cmdContainer.get(), dispatchArgs);
GenCmdList commands;
CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed());
auto itor = find<DefaultWalkerType *>(commands.begin(), commands.end());
ASSERT_NE(itor, commands.end());
auto walkerCmd = genCmdCast<DefaultWalkerType *>(*itor);
auto &postSyncData = walkerCmd->getPostSync();
EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest());
}