refactor: add EncodePostSyncArgs to EncodeDispatchKernelArgs
Refactored various member variables of EncodeDispatchKernelArgs so that it directly includes EncodePostSyncArgs, and changed the command encoder and command list to use the modified EncodeDispatchKernelArgs.

Related-To: NEO-13003
Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
committed by Compute-Runtime-Automation
parent f335295432
commit 40aef1555e
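At a glance, the refactor pulls the post-sync related members of EncodeDispatchKernelArgs out into a standalone EncodePostSyncArgs struct and embeds it back as a postSyncArgs member. A minimal sketch of the resulting shape, with the field lists abbreviated (the complete definitions are in the hunks below):

// Minimal sketch only; field lists are abbreviated, full definitions follow in the diff.
#include <cstdint>

namespace NEO {

struct EncodePostSyncArgs {            // post-sync state, now a standalone struct
    uint64_t eventAddress = 0;
    uint64_t postSyncImmValue = 0;
    bool dcFlushEnable = false;
    // ... remaining post-sync fields, plus requiresSystemMemoryFence()
};

struct EncodeDispatchKernelArgs {
    EncodePostSyncArgs postSyncArgs{}; // new: post-sync state embedded here
    // ... dispatch-only fields keep their place at the top level
};

} // namespace NEO

Call sites follow the same pattern: reads of args.dcFlushEnable, args.eventAddress, or args.requiresSystemMemoryFence() become args.postSyncArgs.dcFlushEnable, args.postSyncArgs.eventAddress, and args.postSyncArgs.requiresSystemMemoryFence().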

@@ -49,7 +49,7 @@ struct StateComputeModeProperties;
struct ImplicitArgs;
struct EncodeKernelArgsExt;

-struct EncodeDispatchKernelArgs {
+struct EncodePostSyncArgs {
    uint64_t eventAddress = 0;
    uint64_t postSyncImmValue = 0;
    uint64_t inOrderCounterValue = 0;

@@ -57,6 +57,53 @@ struct EncodeDispatchKernelArgs {
    uint64_t inOrderIncrementValue = 0;
    Device *device = nullptr;
    NEO::InOrderExecInfo *inOrderExecInfo = nullptr;
+    bool isTimestampEvent = false;
+    bool isHostScopeSignalEvent = false;
+    bool isKernelUsingSystemAllocation = false;
+    bool dcFlushEnable = false;
+    bool interruptEvent = false;
+    bool isFlushL3ForExternalAllocationRequired = false;
+    bool isFlushL3ForHostUsmRequired = false;
+
+    bool requiresSystemMemoryFence() const {
+        return (isHostScopeSignalEvent && isKernelUsingSystemAllocation && !device->getHardwareInfo().capabilityTable.isIntegratedDevice);
+    }
+};
+
+template <typename GfxFamily>
+struct EncodePostSync {
+    static constexpr size_t timestampDestinationAddressAlignment = 16;
+    static constexpr size_t immWriteDestinationAddressAlignment = 8;
+
+    template <typename CommandType>
+    static void encodeL3Flush(CommandType &cmd, const EncodePostSyncArgs &args);
+
+    template <typename CommandType>
+    static void setupPostSyncForRegularEvent(CommandType &cmd, const EncodePostSyncArgs &args);
+
+    template <typename CommandType>
+    static void setupPostSyncForInOrderExec(CommandType &cmd, const EncodePostSyncArgs &args);
+
+    static uint32_t getPostSyncMocs(const RootDeviceEnvironment &rootDeviceEnvironment, const bool dcFlush);
+
+    template <typename CommandType>
+    static auto &getPostSync(CommandType &cmd, size_t index);
+
+    template <typename PostSyncT>
+    static void setPostSyncData(PostSyncT &postSyncData, const typename PostSyncT::OPERATION operation, const uint64_t gpuVa, const uint64_t immData, [[maybe_unused]] const uint32_t atomicOpcode, const uint32_t mocs, [[maybe_unused]] const bool interrupt, const bool requiresSystemMemoryFence);
+
+    template <typename PostSyncT>
+    static void setPostSyncDataCommon(PostSyncT &postSyncData, const typename PostSyncT::OPERATION operation, const uint64_t gpuVa, const uint64_t immData);
+
+    template <typename CommandType>
+    static void setCommandLevelInterrupt(CommandType &cmd, bool interrupt);
+
+    template <typename CommandType>
+    static void adjustTimestampPacket(CommandType &cmd, const EncodePostSyncArgs &args);
+};
+
+struct EncodeDispatchKernelArgs {
+    Device *device = nullptr;
    DispatchKernelEncoderI *dispatchInterface = nullptr;
    IndirectHeap *surfaceStateHeap = nullptr;
    IndirectHeap *dynamicStateHeap = nullptr;

@@ -67,6 +114,7 @@ struct EncodeDispatchKernelArgs {
    void *outImplicitArgsPtr = nullptr;
    std::list<void *> *additionalCommands = nullptr;
    EncodeKernelArgsExt *extendedArgs = nullptr;
+    NEO::EncodePostSyncArgs postSyncArgs{};
    PreemptionMode preemptionMode = PreemptionMode::Initial;
    NEO::RequiredPartitionDim requiredPartitionDim = NEO::RequiredPartitionDim::none;
    NEO::RequiredDispatchWalkOrder requiredDispatchWalkOrder = NEO::RequiredDispatchWalkOrder::none;

@@ -77,26 +125,15 @@ struct EncodeDispatchKernelArgs {
    int32_t defaultPipelinedThreadArbitrationPolicy = NEO::ThreadArbitrationPolicy::NotPresent;
    bool isIndirect = false;
    bool isPredicate = false;
-    bool isTimestampEvent = false;
    bool requiresUncachedMocs = false;
    bool isInternal = false;
    bool isCooperative = false;
-    bool isHostScopeSignalEvent = false;
-    bool isKernelUsingSystemAllocation = false;
    bool isKernelDispatchedFromImmediateCmdList = false;
    bool isRcs = false;
-    bool dcFlushEnable = false;
    bool isHeaplessModeEnabled = false;
    bool isHeaplessStateInitEnabled = false;
-    bool interruptEvent = false;
    bool immediateScratchAddressPatching = false;
    bool makeCommandView = false;
-    bool isFlushL3AfterPostSyncForExternalAllocationRequired = false;
-    bool isFlushL3AfterPostSyncForHostUsmRequired = false;
-
-    bool requiresSystemMemoryFence() const {
-        return (isHostScopeSignalEvent && isKernelUsingSystemAllocation && !device->getHardwareInfo().capabilityTable.isIntegratedDevice);
-    }
};

struct EncodeStoreMMIOParams {

@@ -709,58 +746,4 @@ struct EnodeUserInterrupt {
    static void encode(LinearStream &commandStream);
};

-struct EncodePostSyncArgs {
-    uint64_t eventAddress = 0;
-    uint64_t postSyncImmValue = 0;
-    uint64_t inOrderCounterValue = 0;
-    uint64_t inOrderIncrementGpuAddress = 0;
-    uint64_t inOrderIncrementValue = 0;
-    Device *device = nullptr;
-    NEO::InOrderExecInfo *inOrderExecInfo = nullptr;
-    bool isTimestampEvent = false;
-    bool isHostScopeSignalEvent = false;
-    bool isKernelUsingSystemAllocation = false;
-    bool dcFlushEnable = false;
-    bool interruptEvent = false;
-    bool isFlushL3ForExternalAllocationRequired = false;
-    bool isFlushL3ForHostUsmRequired = false;
-    bool requiresSystemMemoryFence() const {
-        return (isHostScopeSignalEvent && isKernelUsingSystemAllocation);
-    }
-};
-
-template <typename GfxFamily>
-struct EncodePostSync {
-    static constexpr size_t timestampDestinationAddressAlignment = 16;
-    static constexpr size_t immWriteDestinationAddressAlignment = 8;
-
-    static EncodePostSyncArgs createPostSyncArgs(const EncodeDispatchKernelArgs &args);
-
-    template <typename CommandType>
-    static void encodeL3Flush(CommandType &cmd, const EncodePostSyncArgs &args);
-
-    template <typename CommandType>
-    static void setupPostSyncForRegularEvent(CommandType &cmd, const EncodePostSyncArgs &args);
-
-    template <typename CommandType>
-    static void setupPostSyncForInOrderExec(CommandType &cmd, const EncodePostSyncArgs &args);
-
-    static uint32_t getPostSyncMocs(const RootDeviceEnvironment &rootDeviceEnvironment, const bool dcFlush);
-
-    template <typename CommandType>
-    static auto &getPostSync(CommandType &cmd, size_t index);
-
-    template <typename PostSyncT>
-    static void setPostSyncData(PostSyncT &postSyncData, const typename PostSyncT::OPERATION operation, const uint64_t gpuVa, const uint64_t immData, [[maybe_unused]] const uint32_t atomicOpcode, const uint32_t mocs, [[maybe_unused]] const bool interrupt, const bool requiresSystemMemoryFence);
-
-    template <typename PostSyncT>
-    static void setPostSyncDataCommon(PostSyncT &postSyncData, const typename PostSyncT::OPERATION operation, const uint64_t gpuVa, const uint64_t immData);
-
-    template <typename CommandType>
-    static void setCommandLevelInterrupt(CommandType &cmd, bool interrupt);
-
-    template <typename CommandType>
-    static void adjustTimestampPacket(CommandType &cmd, const EncodePostSyncArgs &args);
-};
-
} // namespace NEO
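With EncodePostSyncArgs now defined at the top of the header and embedded in EncodeDispatchKernelArgs, callers populate post-sync state through postSyncArgs and query the fence requirement there as well. A minimal sketch of a hypothetical caller, assuming the declarations from the hunks above are in scope; prepareDispatch and its parameters are illustrative only:

#include <cstdint>

// Hypothetical helper for illustration; only the field and method names on
// postSyncArgs come from the header hunks above.
void prepareDispatch(NEO::EncodeDispatchKernelArgs &args, uint64_t eventGpuVa,
                     bool signalsHostScope, bool usesSystemAllocation) {
    args.postSyncArgs.eventAddress = eventGpuVa;
    args.postSyncArgs.isHostScopeSignalEvent = signalsHostScope;
    args.postSyncArgs.isKernelUsingSystemAllocation = usesSystemAllocation;
    args.postSyncArgs.dcFlushEnable = true;

    // requiresSystemMemoryFence() dereferences postSyncArgs.device, so the
    // device pointer must already be set when the predicate is evaluated.
    if (args.postSyncArgs.device && args.postSyncArgs.requiresSystemMemoryFence()) {
        // request a system memory fence allocation for this dispatch
    }
}

Note that the relocated requiresSystemMemoryFence() keeps the capabilityTable.isIntegratedDevice check from the old EncodeDispatchKernelArgs version; the old standalone EncodePostSyncArgs definition above did not have it.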

@@ -1173,23 +1173,4 @@ void EncodeComputeMode<Family>::adjustPipelineSelect(CommandContainer &container
        container.getDevice()->getRootDeviceEnvironment());
}

-template <typename Family>
-EncodePostSyncArgs EncodePostSync<Family>::createPostSyncArgs(const EncodeDispatchKernelArgs &args) {
-    return EncodePostSyncArgs{
-        .eventAddress = args.eventAddress,
-        .postSyncImmValue = args.postSyncImmValue,
-        .inOrderCounterValue = args.inOrderCounterValue,
-        .inOrderIncrementGpuAddress = args.inOrderIncrementGpuAddress,
-        .inOrderIncrementValue = args.inOrderIncrementValue,
-        .device = args.device,
-        .inOrderExecInfo = args.inOrderExecInfo,
-        .isTimestampEvent = args.isTimestampEvent,
-        .isHostScopeSignalEvent = args.isHostScopeSignalEvent,
-        .isKernelUsingSystemAllocation = args.isKernelUsingSystemAllocation,
-        .dcFlushEnable = args.dcFlushEnable,
-        .interruptEvent = args.interruptEvent,
-        .isFlushL3ForExternalAllocationRequired = args.isFlushL3AfterPostSyncForExternalAllocationRequired,
-        .isFlushL3ForHostUsmRequired = args.isFlushL3AfterPostSyncForHostUsmRequired};
-}
-
} // namespace NEO

@@ -95,10 +95,6 @@ template <typename Family>
template <typename CommandType>
void EncodePostSync<Family>::encodeL3Flush(CommandType &cmd, const EncodePostSyncArgs &args) {}

template <typename Family>
template <typename CommandType>
void EncodePostSync<Family>::setCommandLevelInterrupt(CommandType &cmd, bool interrupt) {}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::setWalkerRegionSettings(WalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount, uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder) {}

@@ -304,7 +304,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
        args.requiresUncachedMocs) {

        PipeControlArgs syncArgs;
-        syncArgs.dcFlushEnable = args.dcFlushEnable;
+        syncArgs.dcFlushEnable = args.postSyncArgs.dcFlushEnable;
        MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);
        STATE_BASE_ADDRESS sbaCmd;
        auto gmmHelper = container.getDevice()->getGmmHelper();

@@ -370,11 +370,10 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
        requiredWorkgroupOrder,
        rootDeviceEnvironment);

-    auto postSyncArgs = EncodePostSync<Family>::createPostSyncArgs(args);
-    if (args.inOrderExecInfo) {
-        EncodePostSync<Family>::setupPostSyncForInOrderExec(walkerCmd, postSyncArgs);
-    } else if (args.eventAddress) {
-        EncodePostSync<Family>::setupPostSyncForRegularEvent(walkerCmd, postSyncArgs);
+    if (args.postSyncArgs.inOrderExecInfo) {
+        EncodePostSync<Family>::setupPostSyncForInOrderExec(walkerCmd, args.postSyncArgs);
+    } else if (args.postSyncArgs.eventAddress) {
+        EncodePostSync<Family>::setupPostSyncForRegularEvent(walkerCmd, args.postSyncArgs);
    } else {
        EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(walkerCmd);
    }
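Because postSyncArgs travels inside EncodeDispatchKernelArgs, the per-dispatch createPostSyncArgs conversion disappears and encode() hands args.postSyncArgs straight to the EncodePostSync helpers; the precedence is unchanged: in-order execution info first, then a regular event address, otherwise the forced post-sync flush-with-write. A sketch of the resulting call-site pattern, mirroring the hunk above (the free function setupWalkerPostSync and the WalkerType parameter are illustrative, not part of the change):

// Illustrative sketch; assumes the NEO declarations from the header hunks are in scope.
template <typename Family, typename WalkerType>
void setupWalkerPostSync(WalkerType &walkerCmd, const NEO::EncodeDispatchKernelArgs &args) {
    // Previously: auto postSyncArgs = EncodePostSync<Family>::createPostSyncArgs(args);
    if (args.postSyncArgs.inOrderExecInfo) {
        NEO::EncodePostSync<Family>::setupPostSyncForInOrderExec(walkerCmd, args.postSyncArgs);
    } else if (args.postSyncArgs.eventAddress) {
        NEO::EncodePostSync<Family>::setupPostSyncForRegularEvent(walkerCmd, args.postSyncArgs);
    } else {
        NEO::EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(walkerCmd);
    }
}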

@@ -416,7 +415,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
        .requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
        .localRegionSize = args.localRegionSize,
        .maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
-        .requiredSystemFence = args.requiresSystemMemoryFence() && args.device->getGfxCoreHelper().isFenceAllocationRequired(hwInfo),
+        .requiredSystemFence = args.postSyncArgs.requiresSystemMemoryFence() && args.device->getGfxCoreHelper().isFenceAllocationRequired(hwInfo),
        .hasSample = kernelDescriptor.kernelAttributes.flags.hasSample,
        .l0DebuggerEnabled = args.device->getL0Debugger() != nullptr};

@@ -442,7 +441,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
        args.maxWgCountPerTile, // maxWgCountPerTile
        !(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
        !args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
-        args.dcFlushEnable, // dcFlush
+        args.postSyncArgs.dcFlushEnable, // dcFlush
        EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
        args.makeCommandView, // blockDispatchToCommandBuffer
        isRequiredDispatchWorkGroupOrder}; // isRequiredDispatchWorkGroupOrder

@@ -230,7 +230,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis

    if (flush) {
        PipeControlArgs syncArgs;
-        syncArgs.dcFlushEnable = args.dcFlushEnable;
+        syncArgs.dcFlushEnable = args.postSyncArgs.dcFlushEnable;
        if (dirtyHeaps) {
            syncArgs.hdcPipelineFlush = true;
        }

@@ -298,7 +298,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
        .requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
        .localRegionSize = args.localRegionSize,
        .maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
-        .requiredSystemFence = args.requiresSystemMemoryFence(),
+        .requiredSystemFence = args.postSyncArgs.requiresSystemMemoryFence(),
        .hasSample = false};

    using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;