From 58323bfb12a9f6abbdc21fed26d4a363e9c3dbe9 Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Sat, 9 Dec 2023 19:26:30 +0000 Subject: [PATCH] refactor: improve PostSync helper methods Related-To: NEO-8210 Signed-off-by: Dunajski, Bartosz --- .../command_queue/gpgpu_walker_bdw_and_later.inl | 2 -- .../command_queue/gpgpu_walker_xehp_and_later.inl | 4 +--- shared/source/command_container/command_encoder.h | 2 +- shared/source/command_container/command_encoder.inl | 2 +- .../command_encoder_xehp_and_later.inl | 6 ++++-- shared/source/gen11/command_encoder_gen11.cpp | 2 +- shared/source/gen12lp/command_encoder_gen12lp.cpp | 2 +- shared/source/gen8/command_encoder_gen8.cpp | 2 +- shared/source/gen9/command_encoder_gen9.cpp | 2 +- .../source/xe_hpc_core/command_encoder_xe_hpc_core.cpp | 8 +------- .../source/xe_hpg_core/command_encoder_xe_hpg_core.cpp | 10 +--------- 11 files changed, 13 insertions(+), 29 deletions(-) diff --git a/opencl/source/command_queue/gpgpu_walker_bdw_and_later.inl b/opencl/source/command_queue/gpgpu_walker_bdw_and_later.inl index a07525e8c8..7ca65954fc 100644 --- a/opencl/source/command_queue/gpgpu_walker_bdw_and_later.inl +++ b/opencl/source/command_queue/gpgpu_walker_bdw_and_later.inl @@ -75,8 +75,6 @@ void GpgpuWalkerHelper::setupTimestampPacket( 0, rootDeviceEnvironment, args); - - EncodeDispatchKernel::adjustTimestampPacket(*walkerCmd, *rootDeviceEnvironment.getHardwareInfo()); } template diff --git a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl index ffe4a301d4..fa5b0f6b98 100644 --- a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl +++ b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl @@ -104,15 +104,13 @@ void GpgpuWalkerHelper::setupTimestampPacket(LinearStream *cmdStream, const RootDeviceEnvironment &rootDeviceEnvironment) { using POSTSYNC_DATA = std::remove_reference_t>; - const auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo(); auto &postSyncData = walkerCmd->getPostSync(); postSyncData.setDataportPipelineFlush(true); + postSyncData.setDataportSubsliceCacheFlush(true); EncodeDispatchKernel::template setupPostSyncMocs(*walkerCmd, rootDeviceEnvironment, MemorySynchronizationCommands::getDcFlushEnable(true, rootDeviceEnvironment)); - EncodeDispatchKernel::template adjustTimestampPacket(*walkerCmd, hwInfo); - if (debugManager.flags.UseImmDataWriteModeOnPostSyncOperation.get()) { postSyncData.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_IMMEDIATE_DATA); auto contextEndAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNode); diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index b3c67e6483..1bd88a25b7 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -153,7 +153,7 @@ struct EncodeDispatchKernel { static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount); template - static void adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo); + static void adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template static void setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 0039d2ef83..212306c4eb 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -581,7 +581,7 @@ bool EncodeDispatchKernel::inlineDataProgrammingRequired(const KernelDes template template -void EncodeDispatchKernel::adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo) {} +void EncodeDispatchKernel::adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} template void EncodeIndirectParams::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr) { diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 87306f0f84..3601434419 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -347,6 +347,8 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis auto &postSync = walkerCmd.getPostSync(); if (args.eventAddress != 0) { postSync.setDataportPipelineFlush(true); + postSync.setDataportSubsliceCacheFlush(true); + if (args.isTimestampEvent) { UNRECOVERABLE_IF(!(isAligned(args.eventAddress))); @@ -360,12 +362,12 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis postSync.setDestinationAddress(args.eventAddress); EncodeDispatchKernel::setupPostSyncMocs(walkerCmd, rootDeviceEnvironment, args.dcFlushEnable); - EncodeDispatchKernel::adjustTimestampPacket(walkerCmd, hwInfo); + EncodeDispatchKernel::adjustTimestampPacket(walkerCmd, args); } if (debugManager.flags.ForceComputeWalkerPostSyncFlush.get() == 1) { postSync.setDataportPipelineFlush(true); - EncodeDispatchKernel::adjustTimestampPacket(walkerCmd, hwInfo); + postSync.setDataportSubsliceCacheFlush(true); } walkerCmd.setPredicateEnable(args.isPredicate); diff --git a/shared/source/gen11/command_encoder_gen11.cpp b/shared/source/gen11/command_encoder_gen11.cpp index d948025614..13b4eef742 100644 --- a/shared/source/gen11/command_encoder_gen11.cpp +++ b/shared/source/gen11/command_encoder_gen11.cpp @@ -73,7 +73,7 @@ void EncodeComputeMode::programComputeModeCommand(LinearStream &csr, Sta template struct EncodeDispatchKernel; template void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs); -template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const HardwareInfo &hwInfo); +template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void EncodeDispatchKernel::setGrfInfo(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment); template void EncodeDispatchKernel::appendAdditionalIDDFields(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy); template void EncodeDispatchKernel::programBarrierEnable(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo); diff --git a/shared/source/gen12lp/command_encoder_gen12lp.cpp b/shared/source/gen12lp/command_encoder_gen12lp.cpp index 40293e25a0..afcbd46d41 100644 --- a/shared/source/gen12lp/command_encoder_gen12lp.cpp +++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp @@ -113,7 +113,7 @@ void EncodeComputeMode::adjustPipelineSelect(CommandContainer &container template struct EncodeDispatchKernel; template void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs); -template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const HardwareInfo &hwInfo); +template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void EncodeDispatchKernel::setGrfInfo(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment); template void EncodeDispatchKernel::appendAdditionalIDDFields(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy); template void EncodeDispatchKernel::programBarrierEnable(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo); diff --git a/shared/source/gen8/command_encoder_gen8.cpp b/shared/source/gen8/command_encoder_gen8.cpp index 5936e6d9e8..6a2225a1e6 100644 --- a/shared/source/gen8/command_encoder_gen8.cpp +++ b/shared/source/gen8/command_encoder_gen8.cpp @@ -57,7 +57,7 @@ void EncodeBatchBufferStartOrEnd::appendBatchBufferStart(MI_BATCH_BUFFER template struct EncodeDispatchKernel; template void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs); -template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const HardwareInfo &hwInfo); +template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void EncodeDispatchKernel::setGrfInfo(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment); template void EncodeDispatchKernel::appendAdditionalIDDFields(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy); template void EncodeDispatchKernel::programBarrierEnable(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo); diff --git a/shared/source/gen9/command_encoder_gen9.cpp b/shared/source/gen9/command_encoder_gen9.cpp index 5e3d648f92..e73e190004 100644 --- a/shared/source/gen9/command_encoder_gen9.cpp +++ b/shared/source/gen9/command_encoder_gen9.cpp @@ -58,7 +58,7 @@ void EncodeComputeMode::programComputeModeCommand(LinearStream &csr, Sta template struct EncodeDispatchKernel; template void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs); -template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const HardwareInfo &hwInfo); +template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void EncodeDispatchKernel::setGrfInfo(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment); template void EncodeDispatchKernel::appendAdditionalIDDFields(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy); template void EncodeDispatchKernel::programBarrierEnable(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo); diff --git a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp index 3b1460e275..56eebe302a 100644 --- a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp @@ -26,12 +26,6 @@ using Family = NEO::XeHpcCoreFamily; namespace NEO { -template <> -template -void EncodeDispatchKernel::adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo) { - walkerCmd.getPostSync().setDataportSubsliceCacheFlush(true); -} - template <> template void EncodeDispatchKernel::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) { @@ -374,7 +368,7 @@ void EncodeDispatchKernel::adjustBindingTablePrefetch(INTERFACE_DESCRIPT template struct EncodeDispatchKernel; template void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs); -template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const HardwareInfo &hwInfo); +template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void EncodeDispatchKernel::setGrfInfo(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment); template void EncodeDispatchKernel::appendAdditionalIDDFields(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy); template void EncodeDispatchKernel::adjustInterfaceDescriptorData(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::DefaultWalkerType &walkerCmd); diff --git a/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp b/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp index 653008b54a..44ffe78c12 100644 --- a/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp @@ -25,14 +25,6 @@ using Family = NEO::XeHpgCoreFamily; namespace NEO { -template <> -template -void EncodeDispatchKernel::adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo) { - auto &postSyncData = walkerCmd.getPostSync(); - - postSyncData.setDataportSubsliceCacheFlush(true); -} - template <> template void EncodeDispatchKernel::appendAdditionalIDDFields(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) { @@ -229,7 +221,7 @@ template void flushGpuCache(LinearStream *commandStream, const Range; template void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs); -template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const HardwareInfo &hwInfo); +template void EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void EncodeDispatchKernel::setGrfInfo(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment); template void EncodeDispatchKernel::appendAdditionalIDDFields(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy); template void EncodeDispatchKernel::adjustInterfaceDescriptorData(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::DefaultWalkerType &walkerCmd);