Revert "refactor: add EncodePostSyncArgs to EncodeDispatchKernelArgs"

This reverts commit 40aef1555e.

Related-To: NEO-13003
Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
This commit is contained in:
Young Jin Yoon
2025-04-17 12:14:25 +00:00
committed by Compute-Runtime-Automation
parent 5ed6890e74
commit 5a2a792c34
18 changed files with 172 additions and 193 deletions

View File

@@ -49,7 +49,7 @@ struct StateComputeModeProperties;
struct ImplicitArgs;
struct EncodeKernelArgsExt;
struct EncodePostSyncArgs {
struct EncodeDispatchKernelArgs {
uint64_t eventAddress = 0;
uint64_t postSyncImmValue = 0;
uint64_t inOrderCounterValue = 0;
@@ -57,53 +57,6 @@ struct EncodePostSyncArgs {
uint64_t inOrderIncrementValue = 0;
Device *device = nullptr;
NEO::InOrderExecInfo *inOrderExecInfo = nullptr;
bool isTimestampEvent = false;
bool isHostScopeSignalEvent = false;
bool isKernelUsingSystemAllocation = false;
bool dcFlushEnable = false;
bool interruptEvent = false;
bool isFlushL3ForExternalAllocationRequired = false;
bool isFlushL3ForHostUsmRequired = false;
bool requiresSystemMemoryFence() const {
return (isHostScopeSignalEvent && isKernelUsingSystemAllocation && this->device->getProductHelper().isGlobalFenceInPostSyncRequired(this->device->getHardwareInfo()));
}
};
// Family-parameterised collection of static post-sync helpers. This block holds
// only declarations and two constants; no function body is defined here.
template <typename GfxFamily>
struct EncodePostSync {
// Alignment requirements for post-sync destination addresses.
// NOTE(review): presumably expressed in bytes - confirm against the code that uses them.
static constexpr size_t timestampDestinationAddressAlignment = 16;
static constexpr size_t immWriteDestinationAddressAlignment = 8;
// NOTE(review): judging by the name, emits an L3 flush on `cmd` driven by `args` - confirm in the definition.
template <typename CommandType>
static void encodeL3Flush(CommandType &cmd, const EncodePostSyncArgs &args);
// Post-sync setup used on the event path (EncodeDispatchKernel::encode takes it when an event address is set
// and no in-order exec info is present).
template <typename CommandType>
static void setupPostSyncForRegularEvent(CommandType &cmd, const EncodePostSyncArgs &args);
// Post-sync setup used on the in-order path (EncodeDispatchKernel::encode takes it when in-order exec info is set).
template <typename CommandType>
static void setupPostSyncForInOrderExec(CommandType &cmd, const EncodePostSyncArgs &args);
// NOTE(review): presumably returns the MOCS value to program for post-sync writes, depending on `dcFlush` - confirm.
static uint32_t getPostSyncMocs(const RootDeviceEnvironment &rootDeviceEnvironment, const bool dcFlush);
// Returns a reference (deduced type) obtained from `cmd` for the given `index`.
template <typename CommandType>
static auto &getPostSync(CommandType &cmd, size_t index);
// Fills `postSyncData`. `atomicOpcode` and `interrupt` are marked [[maybe_unused]], so a definition may ignore them.
template <typename PostSyncT>
static void setPostSyncData(PostSyncT &postSyncData, const typename PostSyncT::OPERATION operation, const uint64_t gpuVa, const uint64_t immData, [[maybe_unused]] const uint32_t atomicOpcode, const uint32_t mocs, [[maybe_unused]] const bool interrupt, const bool requiresSystemMemoryFence);
// Narrower variant taking only the operation, GPU address and immediate data.
template <typename PostSyncT>
static void setPostSyncDataCommon(PostSyncT &postSyncData, const typename PostSyncT::OPERATION operation, const uint64_t gpuVa, const uint64_t immData);
// NOTE(review): presumably toggles an interrupt setting on the command itself rather than on post-sync data - confirm.
template <typename CommandType>
static void setCommandLevelInterrupt(CommandType &cmd, bool interrupt);
// NOTE(review): presumably applies timestamp-packet specific adjustments to `cmd` - confirm in the definition.
template <typename CommandType>
static void adjustTimestampPacket(CommandType &cmd, const EncodePostSyncArgs &args);
};
struct EncodeDispatchKernelArgs {
Device *device = nullptr;
DispatchKernelEncoderI *dispatchInterface = nullptr;
IndirectHeap *surfaceStateHeap = nullptr;
IndirectHeap *dynamicStateHeap = nullptr;
@@ -114,7 +67,6 @@ struct EncodeDispatchKernelArgs {
void *outImplicitArgsPtr = nullptr;
std::list<void *> *additionalCommands = nullptr;
EncodeKernelArgsExt *extendedArgs = nullptr;
NEO::EncodePostSyncArgs postSyncArgs{};
PreemptionMode preemptionMode = PreemptionMode::Initial;
NEO::RequiredPartitionDim requiredPartitionDim = NEO::RequiredPartitionDim::none;
NEO::RequiredDispatchWalkOrder requiredDispatchWalkOrder = NEO::RequiredDispatchWalkOrder::none;
@@ -125,15 +77,26 @@ struct EncodeDispatchKernelArgs {
int32_t defaultPipelinedThreadArbitrationPolicy = NEO::ThreadArbitrationPolicy::NotPresent;
bool isIndirect = false;
bool isPredicate = false;
bool isTimestampEvent = false;
bool requiresUncachedMocs = false;
bool isInternal = false;
bool isCooperative = false;
bool isHostScopeSignalEvent = false;
bool isKernelUsingSystemAllocation = false;
bool isKernelDispatchedFromImmediateCmdList = false;
bool isRcs = false;
bool dcFlushEnable = false;
bool isHeaplessModeEnabled = false;
bool isHeaplessStateInitEnabled = false;
bool interruptEvent = false;
bool immediateScratchAddressPatching = false;
bool makeCommandView = false;
bool isFlushL3AfterPostSyncForExternalAllocationRequired = false;
bool isFlushL3AfterPostSyncForHostUsmRequired = false;
bool requiresSystemMemoryFence() const {
return (isHostScopeSignalEvent && isKernelUsingSystemAllocation && this->device->getProductHelper().isGlobalFenceInPostSyncRequired(this->device->getHardwareInfo()));
}
};
struct EncodeStoreMMIOParams {
@@ -746,4 +709,58 @@ struct EnodeUserInterrupt {
static void encode(LinearStream &commandStream);
};
// Plain aggregate of inputs handed (by const reference) to the EncodePostSync
// helpers. Member order is significant: createPostSyncArgs() fills this struct
// with designated initializers, which must follow declaration order.
struct EncodePostSyncArgs {
    // 64-bit addresses / values.
    uint64_t eventAddress = 0;
    uint64_t postSyncImmValue = 0;
    uint64_t inOrderCounterValue = 0;
    uint64_t inOrderIncrementGpuAddress = 0;
    uint64_t inOrderIncrementValue = 0;

    // Non-owning pointers; both default to null.
    Device *device = nullptr;
    NEO::InOrderExecInfo *inOrderExecInfo = nullptr;

    // Boolean switches, all off by default.
    bool isTimestampEvent = false;
    bool isHostScopeSignalEvent = false;
    bool isKernelUsingSystemAllocation = false;
    bool dcFlushEnable = false;
    bool interruptEvent = false;
    bool isFlushL3ForExternalAllocationRequired = false;
    bool isFlushL3ForHostUsmRequired = false;

    // True only when the event is host-scope, the kernel uses a system
    // allocation, and the product helper reports that a global fence is
    // required in post sync. `device` is dereferenced only after both flags
    // have been found set, so it must be non-null in that case.
    bool requiresSystemMemoryFence() const {
        if (!isHostScopeSignalEvent) {
            return false;
        }
        if (!isKernelUsingSystemAllocation) {
            return false;
        }
        return this->device->getProductHelper().isGlobalFenceInPostSyncRequired(this->device->getHardwareInfo());
    }
};
// Family-parameterised collection of static post-sync helpers. This block holds
// only declarations and two constants; no function body is defined here.
template <typename GfxFamily>
struct EncodePostSync {
// Alignment requirements for post-sync destination addresses.
// NOTE(review): presumably expressed in bytes - confirm against the code that uses them.
static constexpr size_t timestampDestinationAddressAlignment = 16;
static constexpr size_t immWriteDestinationAddressAlignment = 8;
// Builds an EncodePostSyncArgs by copying the post-sync related fields out of EncodeDispatchKernelArgs.
static EncodePostSyncArgs createPostSyncArgs(const EncodeDispatchKernelArgs &args);
// NOTE(review): judging by the name, emits an L3 flush on `cmd` driven by `args` - confirm in the definition.
template <typename CommandType>
static void encodeL3Flush(CommandType &cmd, const EncodePostSyncArgs &args);
// Post-sync setup used on the event path (EncodeDispatchKernel::encode takes it when an event address is set
// and no in-order exec info is present).
template <typename CommandType>
static void setupPostSyncForRegularEvent(CommandType &cmd, const EncodePostSyncArgs &args);
// Post-sync setup used on the in-order path (EncodeDispatchKernel::encode takes it when in-order exec info is set).
template <typename CommandType>
static void setupPostSyncForInOrderExec(CommandType &cmd, const EncodePostSyncArgs &args);
// NOTE(review): presumably returns the MOCS value to program for post-sync writes, depending on `dcFlush` - confirm.
static uint32_t getPostSyncMocs(const RootDeviceEnvironment &rootDeviceEnvironment, const bool dcFlush);
// Returns a reference (deduced type) obtained from `cmd` for the given `index`.
template <typename CommandType>
static auto &getPostSync(CommandType &cmd, size_t index);
// Fills `postSyncData`. `atomicOpcode` and `interrupt` are marked [[maybe_unused]], so a definition may ignore them.
template <typename PostSyncT>
static void setPostSyncData(PostSyncT &postSyncData, const typename PostSyncT::OPERATION operation, const uint64_t gpuVa, const uint64_t immData, [[maybe_unused]] const uint32_t atomicOpcode, const uint32_t mocs, [[maybe_unused]] const bool interrupt, const bool requiresSystemMemoryFence);
// Narrower variant taking only the operation, GPU address and immediate data.
template <typename PostSyncT>
static void setPostSyncDataCommon(PostSyncT &postSyncData, const typename PostSyncT::OPERATION operation, const uint64_t gpuVa, const uint64_t immData);
// NOTE(review): presumably toggles an interrupt setting on the command itself rather than on post-sync data - confirm.
template <typename CommandType>
static void setCommandLevelInterrupt(CommandType &cmd, bool interrupt);
// NOTE(review): presumably applies timestamp-packet specific adjustments to `cmd` - confirm in the definition.
template <typename CommandType>
static void adjustTimestampPacket(CommandType &cmd, const EncodePostSyncArgs &args);
};
} // namespace NEO

View File

@@ -1173,4 +1173,23 @@ void EncodeComputeMode<Family>::adjustPipelineSelect(CommandContainer &container
container.getDevice()->getRootDeviceEnvironment());
}
template <typename Family>
EncodePostSyncArgs EncodePostSync<Family>::createPostSyncArgs(const EncodeDispatchKernelArgs &args) {
return EncodePostSyncArgs{
.eventAddress = args.eventAddress,
.postSyncImmValue = args.postSyncImmValue,
.inOrderCounterValue = args.inOrderCounterValue,
.inOrderIncrementGpuAddress = args.inOrderIncrementGpuAddress,
.inOrderIncrementValue = args.inOrderIncrementValue,
.device = args.device,
.inOrderExecInfo = args.inOrderExecInfo,
.isTimestampEvent = args.isTimestampEvent,
.isHostScopeSignalEvent = args.isHostScopeSignalEvent,
.isKernelUsingSystemAllocation = args.isKernelUsingSystemAllocation,
.dcFlushEnable = args.dcFlushEnable,
.interruptEvent = args.interruptEvent,
.isFlushL3ForExternalAllocationRequired = args.isFlushL3AfterPostSyncForExternalAllocationRequired,
.isFlushL3ForHostUsmRequired = args.isFlushL3AfterPostSyncForHostUsmRequired};
}
} // namespace NEO

View File

@@ -95,6 +95,10 @@ template <typename Family>
template <typename CommandType>
void EncodePostSync<Family>::encodeL3Flush(CommandType &cmd, const EncodePostSyncArgs &args) {}
// Empty-bodied definition: in this file the call does nothing with `cmd` or `interrupt`.
// NOTE(review): presumably other family-specific files provide a non-empty version - confirm.
template <typename Family>
template <typename CommandType>
void EncodePostSync<Family>::setCommandLevelInterrupt(CommandType &cmd, bool interrupt) {}
// Empty-bodied definition: in this file none of the parameters are read and `walkerCmd` is left untouched.
// NOTE(review): presumably other family-specific files provide a non-empty version - confirm.
template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::setWalkerRegionSettings(WalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount, uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder) {}

View File

@@ -304,7 +304,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
args.requiresUncachedMocs) {
PipeControlArgs syncArgs;
syncArgs.dcFlushEnable = args.postSyncArgs.dcFlushEnable;
syncArgs.dcFlushEnable = args.dcFlushEnable;
MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);
STATE_BASE_ADDRESS sbaCmd;
auto gmmHelper = container.getDevice()->getGmmHelper();
@@ -370,10 +370,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
requiredWorkgroupOrder,
rootDeviceEnvironment);
if (args.postSyncArgs.inOrderExecInfo) {
EncodePostSync<Family>::setupPostSyncForInOrderExec(walkerCmd, args.postSyncArgs);
} else if (args.postSyncArgs.eventAddress) {
EncodePostSync<Family>::setupPostSyncForRegularEvent(walkerCmd, args.postSyncArgs);
auto postSyncArgs = EncodePostSync<Family>::createPostSyncArgs(args);
if (args.inOrderExecInfo) {
EncodePostSync<Family>::setupPostSyncForInOrderExec(walkerCmd, postSyncArgs);
} else if (args.eventAddress) {
EncodePostSync<Family>::setupPostSyncForRegularEvent(walkerCmd, postSyncArgs);
} else {
EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(walkerCmd);
}
@@ -415,7 +416,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
.requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
.localRegionSize = args.localRegionSize,
.maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
.requiredSystemFence = args.postSyncArgs.requiresSystemMemoryFence(),
.requiredSystemFence = args.requiresSystemMemoryFence(),
.hasSample = kernelDescriptor.kernelAttributes.flags.hasSample,
.l0DebuggerEnabled = args.device->getL0Debugger() != nullptr};
@@ -441,7 +442,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
args.maxWgCountPerTile, // maxWgCountPerTile
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
args.postSyncArgs.dcFlushEnable, // dcFlush
args.dcFlushEnable, // dcFlush
EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
args.makeCommandView, // blockDispatchToCommandBuffer
isRequiredDispatchWorkGroupOrder}; // isRequiredDispatchWorkGroupOrder

View File

@@ -230,7 +230,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
if (flush) {
PipeControlArgs syncArgs;
syncArgs.dcFlushEnable = args.postSyncArgs.dcFlushEnable;
syncArgs.dcFlushEnable = args.dcFlushEnable;
if (dirtyHeaps) {
syncArgs.hdcPipelineFlush = true;
}
@@ -298,7 +298,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
.requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
.localRegionSize = args.localRegionSize,
.maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
.requiredSystemFence = args.postSyncArgs.requiresSystemMemoryFence(),
.requiredSystemFence = args.requiresSystemMemoryFence(),
.hasSample = false};
using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;