From 32fd00e150e6221c036a8ce5f2bc7289d87b7c04 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Thu, 31 Oct 2024 09:06:05 +0000 Subject: [PATCH] refactor: change additional walker fields encoder 4/n - move post sync system fence into dedicated encoder Related-To: NEO-12639 Signed-off-by: Zbigniew Zdanowicz --- .../hardware_interface_xehp_and_later.inl | 1 + .../command_container/command_encoder.h | 2 ++ .../command_encoder_bdw_and_later.inl | 13 +++++++ .../command_encoder_enablers.inl | 1 + .../command_encoder_xehp_and_later.inl | 13 +++++++ .../command_encoder_xe2_hpg_core.cpp | 8 ----- .../command_encoder_xe_hpc_core.cpp | 11 ------ .../command_encoder_xe_hpg_core.cpp | 7 ++-- .../pvc/dispatch_walker_tests_pvc.cpp | 6 ++-- .../pvc/test_encode_dispatch_kernel_pvc.cpp | 36 ------------------- 10 files changed, 38 insertions(+), 60 deletions(-) diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl index 3426340a53..2898b18937 100644 --- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl +++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl @@ -148,6 +148,7 @@ inline void HardwareInterface::programWalker( EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), requiredSystemFence, kernelInfo.kernelDescriptor, kernelAttributes.walkOrder, kernelAttributes.additionalSize, maxFrontEndThreads}; EncodeDispatchKernel::template encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, encodeWalkerArgs); + EncodeDispatchKernel::template encodeWalkerPostSyncFields(walkerCmd, encodeWalkerArgs); EncodeDispatchKernel::template overrideDefaultValues(walkerCmd, *interfaceDescriptor); auto devices = queueCsr.getOsContext().getDeviceBitfield(); diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index 182435de24..cd048437d0 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -229,6 +229,8 @@ struct EncodeDispatchKernel { template static void overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor); + template + static void encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs); }; template diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 0c1796296d..2849ea7131 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -278,6 +278,15 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension(); EncodeDispatchKernel::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd); + EncodeWalkerArgs walkerArgs{ + KernelExecutionType::defaultType, + args.requiresSystemMemoryFence(), + kernelDescriptor, + args.requiredDispatchWalkOrder, + args.additionalSizeParam, + args.device->getDeviceInfo().maxFrontEndThreads}; + EncodeDispatchKernel::encodeAdditionalWalkerFields(rootDeviceEnvironment, cmd, walkerArgs); + EncodeDispatchKernel::encodeWalkerPostSyncFields(cmd, walkerArgs); memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd)); @@ -404,6 +413,10 @@ template template inline void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {} +template +template +inline void NEO::EncodeDispatchKernel::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {} + template template void EncodeDispatchKernel::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {} diff --git a/shared/source/command_container/command_encoder_enablers.inl b/shared/source/command_container/command_encoder_enablers.inl index b0df4c5baa..98fb4579e9 100644 --- a/shared/source/command_container/command_encoder_enablers.inl +++ b/shared/source/command_container/command_encoder_enablers.inl @@ -32,6 +32,7 @@ template void NEO::EncodeDispatchKernel::forceComputeWalkerPostSyncFlush template void NEO::EncodeDispatchKernel::setWalkerRegionSettings(Family::DefaultWalkerType &walkerCmd, const HardwareInfo &hwInfo, uint32_t partitionCount, uint32_t workgroupSize, uint32_t maxWgCountPerTile, bool requiredWalkOrder); template void NEO::EncodeDispatchKernel::overrideDefaultValues(Family::DefaultWalkerType &walkerCmd, Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor); +template void NEO::EncodeDispatchKernel::encodeWalkerPostSyncFields(Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs); template struct NEO::EncodeStates; template struct NEO::EncodeMath; diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 6e576915d4..d22cfc6245 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -412,6 +412,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis args.additionalSizeParam, args.device->getDeviceInfo().maxFrontEndThreads}; EncodeDispatchKernel::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs); + EncodeDispatchKernel::encodeWalkerPostSyncFields(walkerCmd, walkerArgs); EncodeDispatchKernel::overrideDefaultValues(walkerCmd, idd); @@ -1235,4 +1236,16 @@ void EncodeDispatchKernel::encodeThreadGroupDispatch(InterfaceDescriptor } } +template +template +void EncodeDispatchKernel::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) { + auto programGlobalFenceAsPostSyncOperationInComputeWalker = walkerArgs.requiredSystemFence; + int32_t overrideProgramSystemMemoryFence = debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get(); + if (overrideProgramSystemMemoryFence != -1) { + programGlobalFenceAsPostSyncOperationInComputeWalker = !!overrideProgramSystemMemoryFence; + } + auto &postSyncData = walkerCmd.getPostSync(); + postSyncData.setSystemMemoryFenceRequest(programGlobalFenceAsPostSyncOperationInComputeWalker); +} + } // namespace NEO diff --git a/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp b/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp index d5e89df87a..34e7d91392 100644 --- a/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp +++ b/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp @@ -232,14 +232,6 @@ void EncodeSurfaceState::disableCompressionFlags(R_SURFACE_STATE *surfac template <> template void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) { - auto programGlobalFenceAsPostSyncOperationInComputeWalker = walkerArgs.requiredSystemFence; - int32_t overrideProgramSystemMemoryFence = debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get(); - if (overrideProgramSystemMemoryFence != -1) { - programGlobalFenceAsPostSyncOperationInComputeWalker = !!overrideProgramSystemMemoryFence; - } - auto &postSyncData = walkerCmd.getPostSync(); - postSyncData.setSystemMemoryFenceRequest(programGlobalFenceAsPostSyncOperationInComputeWalker); - bool computeDispatchAllWalkerEnable = walkerArgs.kernelExecutionType == KernelExecutionType::concurrent; int32_t overrideComputeDispatchAllWalkerEnable = debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.get(); if (overrideComputeDispatchAllWalkerEnable != -1) { diff --git a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp index c92ca1069a..7ba30f3923 100644 --- a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp @@ -163,17 +163,6 @@ void EncodeDispatchKernel::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT template <> template void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) { - const auto &productHelper = rootDeviceEnvironment.getHelper(); - auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo(); - auto programGlobalFenceAsPostSyncOperationInComputeWalker = productHelper.isGlobalFenceInCommandStreamRequired(hwInfo) && - walkerArgs.requiredSystemFence; - int32_t overrideProgramSystemMemoryFence = debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get(); - if (overrideProgramSystemMemoryFence != -1) { - programGlobalFenceAsPostSyncOperationInComputeWalker = !!overrideProgramSystemMemoryFence; - } - auto &postSyncData = walkerCmd.getPostSync(); - postSyncData.setSystemMemoryFenceRequest(programGlobalFenceAsPostSyncOperationInComputeWalker); - int32_t overrideDispatchAllWalkerEnableInComputeWalker = debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.get(); if (overrideDispatchAllWalkerEnableInComputeWalker != -1) { walkerCmd.setComputeDispatchAllWalkerEnable(overrideDispatchAllWalkerEnableInComputeWalker); diff --git a/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp b/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp index a80d24e5b5..7c90f78dc0 100644 --- a/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp @@ -58,8 +58,11 @@ void EncodeDispatchKernel::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT template <> template -void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) { -} +void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {} + +template <> +template +void EncodeDispatchKernel::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {} template <> void EncodeComputeMode::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const RootDeviceEnvironment &rootDeviceEnvironment) { diff --git a/shared/test/unit_test/xe_hpc_core/pvc/dispatch_walker_tests_pvc.cpp b/shared/test/unit_test/xe_hpc_core/pvc/dispatch_walker_tests_pvc.cpp index 02d06439e2..fe8dec7345 100644 --- a/shared/test/unit_test/xe_hpc_core/pvc/dispatch_walker_tests_pvc.cpp +++ b/shared/test/unit_test/xe_hpc_core/pvc/dispatch_walker_tests_pvc.cpp @@ -27,7 +27,7 @@ PVCTEST_F(WalkerDispatchTestsPvc, givenPvcWhenEncodeAdditionalWalkerFieldsThenPo int32_t programGlobalFenceAsPostSyncOperationInComputeWalker; bool expectSystemMemoryFenceRequest; } testInputs[] = { - {0x0, -1, false}, + {0x0, -1, true}, {0x3, -1, true}, {0x0, 0, false}, {0x3, 0, false}, @@ -53,7 +53,7 @@ PVCTEST_F(WalkerDispatchTestsPvc, givenPvcWhenEncodeAdditionalWalkerFieldsThenPo testInput.programGlobalFenceAsPostSyncOperationInComputeWalker); postSyncData.setSystemMemoryFenceRequest(false); - EncodeDispatchKernel::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs); + EncodeDispatchKernel::encodeWalkerPostSyncFields(walkerCmd, walkerArgs); EXPECT_EQ(testInput.expectSystemMemoryFenceRequest, postSyncData.getSystemMemoryFenceRequest()); } } @@ -75,7 +75,7 @@ PVCTEST_F(WalkerDispatchTestsPvc, givenPvcSupportsSystemMemoryFenceWhenNoSystemF hwInfo.platform.usDeviceID = deviceId; postSyncData.setSystemMemoryFenceRequest(true); - EncodeDispatchKernel::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs); + EncodeDispatchKernel::encodeWalkerPostSyncFields(walkerCmd, walkerArgs); EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); } } diff --git a/shared/test/unit_test/xe_hpc_core/pvc/test_encode_dispatch_kernel_pvc.cpp b/shared/test/unit_test/xe_hpc_core/pvc/test_encode_dispatch_kernel_pvc.cpp index 774050cca5..ec5c14a974 100644 --- a/shared/test/unit_test/xe_hpc_core/pvc/test_encode_dispatch_kernel_pvc.cpp +++ b/shared/test/unit_test/xe_hpc_core/pvc/test_encode_dispatch_kernel_pvc.cpp @@ -128,39 +128,3 @@ PVCTEST_F(CommandEncodeStatesTestPvc, GivenVariousSlmTotalSizesWhenSetPreferredS verifyPreferredSlmValues(valuesToTest, pDevice->getRootDeviceEnvironment()); } - -PVCTEST_F(EncodeKernelPvcTest, givenDefaultSettingForFenceAsPostSyncOperationInComputeWalkerWhenEnqueueKernelIsCalledThenDoNotGenerateFenceCommands) { - using DefaultWalkerType = typename FamilyType::DefaultWalkerType; - using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; - - DebugManagerStateRestore restore; - debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1); - - auto &hwInfo = *pDevice->getRootDeviceEnvironment().getMutableHardwareInfo(); - auto &productHelper = pDevice->getProductHelper(); - - hwInfo.platform.usDeviceID = pvcXlDeviceIds[0]; - VariableBackup hwRevId{&hwInfo.platform.usRevId}; - hwRevId = productHelper.getHwRevIdFromStepping(REVISION_A0, hwInfo); - - uint32_t dims[] = {1, 1, 1}; - std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); - dispatchInterface->getCrossThreadDataSizeResult = 0u; - - bool requiresUncachedMocs = false; - EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs); - dispatchArgs.isKernelUsingSystemAllocation = true; - dispatchArgs.isHostScopeSignalEvent = true; - - EncodeDispatchKernel::template encode(*cmdContainer.get(), dispatchArgs); - - GenCmdList commands; - CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); - - auto itor = find(commands.begin(), commands.end()); - ASSERT_NE(itor, commands.end()); - - auto walkerCmd = genCmdCast(*itor); - auto &postSyncData = walkerCmd->getPostSync(); - EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); -}