performance: use resource_barrier on Xe2 and PTL

Related-To: NEO-14943

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2025-06-23 13:42:45 +00:00
committed by Compute-Runtime-Automation
parent 6b33e62d15
commit 0db5ce22a1
35 changed files with 246 additions and 79 deletions

View File

@@ -26,7 +26,7 @@ size_t EncodeComputeMode<Family>::getCmdSizeForComputeMode(const RootDeviceEnvir
}
size += sizeof(typename Family::STATE_COMPUTE_MODE);
if (hasSharedHandles) {
size += MemorySynchronizationCommands<Family>::getSizeForSingleBarrier();
size += MemorySynchronizationCommands<Family>::getSizeForStallingBarrier();
}
if (productHelper.is3DPipelineSelectWARequired() && isRcs) {
size += (2 * PreambleHelper<Family>::getCmdSizeForPipelineSelect(rootDeviceEnvironment));

View File

@@ -831,7 +831,8 @@ size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSize(const Dispat
}
if (debugManager.flags.ForcePipeControlPriorToWalker.get()) {
size += 2 * MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
size += MemorySynchronizationCommands<GfxFamily>::getSizeForStallingBarrier();
size += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
}
return size;

View File

@@ -101,6 +101,7 @@ struct Gen12LpFamily : public Gen12Lp {
using XY_COLOR_BLT = typename GfxFamily::XY_FAST_COLOR_BLT;
using MI_STORE_REGISTER_MEM_CMD = typename GfxFamily::MI_STORE_REGISTER_MEM;
using TimestampPacketType = uint32_t;
using StallingBarrierType = PIPE_CONTROL;
static const GPGPU_WALKER cmdInitGpgpuWalker;
static const INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;
static const MEDIA_INTERFACE_DESCRIPTOR_LOAD cmdInitMediaInterfaceDescriptorLoad;

View File

@@ -491,6 +491,7 @@ struct MemorySynchronizationCommands {
static void setSingleBarrier(void *commandsBuffer, PostSyncMode postSyncMode, uint64_t gpuAddress, uint64_t immediateData, PipeControlArgs &args);
static void addSingleBarrier(LinearStream &commandStream, PipeControlArgs &args);
static void setSingleBarrier(void *commandsBuffer, PipeControlArgs &args);
static void setStallingBarrier(void *commandsBuffer, PipeControlArgs &args);
static void addBarrierWithPostSyncOperation(LinearStream &commandStream, PostSyncMode postSyncMode, uint64_t gpuAddress, uint64_t immediateData, const RootDeviceEnvironment &rootDeviceEnvironment, PipeControlArgs &args);
static void setBarrierWithPostSyncOperation(void *&commandsBuffer, PostSyncMode postSyncMode, uint64_t gpuAddress, uint64_t immediateData, const RootDeviceEnvironment &rootDeviceEnvironment, PipeControlArgs &args);
@@ -525,6 +526,7 @@ struct MemorySynchronizationCommands {
static size_t getSizeForSingleAdditionalSynchronization(NEO::FenceType fenceType, const RootDeviceEnvironment &rootDeviceEnvironment);
static size_t getSizeForAdditionalSynchronization(NEO::FenceType fenceType, const RootDeviceEnvironment &rootDeviceEnvironment);
static size_t getSizeForInstructionCacheFlush();
static size_t getSizeForStallingBarrier();
static bool isBarrierWaRequired(const RootDeviceEnvironment &rootDeviceEnvironment);
static bool isBarrierPriorToPipelineSelectWaRequired(const RootDeviceEnvironment &rootDeviceEnvironment);

View File

@@ -236,25 +236,40 @@ void MemorySynchronizationCommands<GfxFamily>::setSingleBarrier(void *commandsBu
// Reserves space in commandStream for one barrier command and programs it there.
//
// The reservation is sized with getSizeForSingleBarrier() by default. When args.csStallOnly is set,
// getSizeForStallingBarrier() is used instead, so the reserved size follows the family's
// StallingBarrierType. The reserved space is then passed to setSingleBarrier() together with the
// post-sync parameters.
template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(LinearStream &commandStream, PostSyncMode postSyncMode, uint64_t gpuAddress, uint64_t immediateData, PipeControlArgs &args) {
    auto barrierSize = MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
    if (args.csStallOnly) {
        barrierSize = MemorySynchronizationCommands<GfxFamily>::getSizeForStallingBarrier();
    }
    // Space is taken from the stream exactly once, after the final size is known.
    auto barrier = commandStream.getSpace(barrierSize);
    setSingleBarrier(barrier, postSyncMode, gpuAddress, immediateData, args);
}
// Generic stalling barrier: writes a PIPE_CONTROL at commandsBuffer.
//
// The command starts as a copy of GfxFamily::cmdInitPipeControl, has command streamer stall enabled,
// and is then passed through setBarrierExtraProperties() with args before being stored.
// commandsBuffer must point to writable space large enough to hold a PIPE_CONTROL.
template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::setStallingBarrier(void *commandsBuffer, PipeControlArgs &args) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    PIPE_CONTROL pipeControl = GfxFamily::cmdInitPipeControl;
    pipeControl.setCommandStreamerStallEnable(true);
    setBarrierExtraProperties(&pipeControl, args);
    *reinterpret_cast<PIPE_CONTROL *>(commandsBuffer) = pipeControl;
}
template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::setSingleBarrier(void *commandsBuffer, PostSyncMode postSyncMode, uint64_t gpuAddress, uint64_t immediateData, PipeControlArgs &args) {
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
if (args.csStallOnly) {
*reinterpret_cast<PIPE_CONTROL *>(commandsBuffer) = pipeControl;
setStallingBarrier(commandsBuffer, args);
return;
}
PIPE_CONTROL pipeControl = GfxFamily::cmdInitPipeControl;
pipeControl.setCommandStreamerStallEnable(true);
setBarrierExtraProperties(&pipeControl, args);
pipeControl.setConstantCacheInvalidationEnable(args.constantCacheInvalidationEnable);
pipeControl.setInstructionCacheInvalidateEnable(args.instructionCacheInvalidateEnable);
pipeControl.setPipeControlFlushEnable(args.pipeControlFlushEnable);
@@ -335,7 +350,7 @@ void MemorySynchronizationCommands<GfxFamily>::setBarrierWa(void *&commandsBuffe
additionalArgs.csStallOnly = true;
MemorySynchronizationCommands<GfxFamily>::setSingleBarrier(commandsBuffer, additionalArgs);
commandsBuffer = ptrOffset(commandsBuffer, sizeof(PIPE_CONTROL));
commandsBuffer = ptrOffset(commandsBuffer, getSizeForStallingBarrier());
}
}
@@ -384,7 +399,7 @@ size_t MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWa(const RootD
size = getSizeForSingleBarrier() +
getSizeForSingleAdditionalSynchronization(NEO::FenceType::release, rootDeviceEnvironment);
} else if (releaseHelper && postSyncMode == PostSyncMode::timestamp && releaseHelper->programmAdditionalStallPriorToBarrierWithTimestamp()) {
size = getSizeForSingleBarrier();
size = getSizeForStallingBarrier();
}
return size;
}
@@ -851,6 +866,11 @@ bool GfxCoreHelperHw<Family>::isCacheFlushPriorImageReadRequired() const {
return false;
}
// Returns the size in bytes of the command type the family names as its StallingBarrierType.
template <typename GfxFamily>
size_t MemorySynchronizationCommands<GfxFamily>::getSizeForStallingBarrier() {
    using StallingBarrierCommand = typename GfxFamily::StallingBarrierType;
    return sizeof(StallingBarrierCommand);
}
template <typename Family>
uint32_t GfxCoreHelperHw<Family>::getQueuePriorityLevels() const {
return 2;

View File

@@ -291,6 +291,17 @@ uint32_t GfxCoreHelperHw<Family>::getDeviceTimestampWidth() const {
return 64u;
};
// Explicit specialization: the stalling barrier is written as a RESOURCE_BARRIER at commandsBuffer.
// The command is an immediate barrier with wait stage TOP and signal stage GPGPU.
// The args parameter is not read by this specialization.
template <>
void MemorySynchronizationCommands<Family>::setStallingBarrier(void *commandsBuffer, PipeControlArgs &args) {
    using RESOURCE_BARRIER = typename Family::RESOURCE_BARRIER;

    // Work on a local copy of Family::cmdInitResourceBarrier and fill in the barrier fields.
    auto barrierCmd = Family::cmdInitResourceBarrier;
    barrierCmd.setBarrierType(RESOURCE_BARRIER::BARRIER_TYPE::BARRIER_TYPE_IMMEDIATE);
    barrierCmd.setWaitStage(RESOURCE_BARRIER::WAIT_STAGE::WAIT_STAGE_TOP);
    barrierCmd.setSignalStage(RESOURCE_BARRIER::SIGNAL_STAGE::SIGNAL_STAGE_GPGPU);

    // Store the finished command into the caller-provided buffer.
    auto *destination = reinterpret_cast<RESOURCE_BARRIER *>(commandsBuffer);
    *destination = barrierCmd;
}
} // namespace NEO
namespace NEO {

View File

@@ -108,6 +108,7 @@ struct Xe2HpgCoreFamily : public Xe2HpgCore {
using XY_COLOR_BLT = typename GfxFamily::XY_FAST_COLOR_BLT;
using MI_STORE_REGISTER_MEM_CMD = typename GfxFamily::MI_STORE_REGISTER_MEM;
using TimestampPacketType = uint64_t;
using StallingBarrierType = RESOURCE_BARRIER;
static const COMPUTE_WALKER cmdInitGpgpuWalker;
static const CFE_STATE cmdInitCfeState;
static const INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;

View File

@@ -259,6 +259,18 @@ uint32_t GfxCoreHelperHw<Family>::adjustMaxWorkGroupSize(const uint32_t grfCount
const uint32_t threadsPerThreadGroup = calculateNumThreadsPerThreadGroup(simd, defaultMaxGroupSize, grfCount, rootDeviceEnvironment);
return (threadsPerThreadGroup * simd);
}
// Explicit specialization: the stalling barrier is written as a RESOURCE_BARRIER at commandsBuffer.
// The command is an immediate barrier with wait stage TOP and signal stage GPGPU.
// The args parameter is not read by this specialization.
template <>
void MemorySynchronizationCommands<Family>::setStallingBarrier(void *commandsBuffer, PipeControlArgs &args) {
    using RESOURCE_BARRIER = typename Family::RESOURCE_BARRIER;

    // Work on a local copy of Family::cmdInitResourceBarrier and fill in the barrier fields.
    auto barrierCmd = Family::cmdInitResourceBarrier;
    barrierCmd.setBarrierType(RESOURCE_BARRIER::BARRIER_TYPE::BARRIER_TYPE_IMMEDIATE);
    barrierCmd.setWaitStage(RESOURCE_BARRIER::WAIT_STAGE::WAIT_STAGE_TOP);
    barrierCmd.setSignalStage(RESOURCE_BARRIER::SIGNAL_STAGE::SIGNAL_STAGE_GPGPU);

    // Store the finished command into the caller-provided buffer.
    auto *destination = reinterpret_cast<RESOURCE_BARRIER *>(commandsBuffer);
    *destination = barrierCmd;
}
} // namespace NEO
namespace NEO {

View File

@@ -110,6 +110,7 @@ struct Xe3CoreFamily : public Xe3Core {
using XY_COLOR_BLT = typename GfxFamily::XY_FAST_COLOR_BLT;
using MI_STORE_REGISTER_MEM_CMD = typename GfxFamily::MI_STORE_REGISTER_MEM;
using TimestampPacketType = uint64_t;
using StallingBarrierType = RESOURCE_BARRIER;
static const COMPUTE_WALKER cmdInitGpgpuWalker;
static const CFE_STATE cmdInitCfeState;
static const INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;

View File

@@ -90,6 +90,7 @@ struct XeHpcCoreFamily : public XeHpcCore {
using XY_COLOR_BLT = typename GfxFamily::XY_FAST_COLOR_BLT;
using MI_STORE_REGISTER_MEM_CMD = typename GfxFamily::MI_STORE_REGISTER_MEM;
using TimestampPacketType = uint32_t;
using StallingBarrierType = PIPE_CONTROL;
static const COMPUTE_WALKER cmdInitGpgpuWalker;
static const CFE_STATE cmdInitCfeState;
static const INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;

View File

@@ -110,6 +110,7 @@ struct XeHpgCoreFamily : public XeHpgCore {
using XY_COLOR_BLT = typename GfxFamily::XY_FAST_COLOR_BLT;
using MI_STORE_REGISTER_MEM_CMD = typename GfxFamily::MI_STORE_REGISTER_MEM;
using TimestampPacketType = uint32_t;
using StallingBarrierType = PIPE_CONTROL;
static const COMPUTE_WALKER cmdInitGpgpuWalker;
static const CFE_STATE cmdInitCfeState;
static const INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;