mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-04 15:53:45 +08:00
Optimize timestamp packet dependencies
- Clear dependencies even if last engine changed - Do not program semaphore waiting for blit when blit is submitted with gpgpu - Track barrier timestamps to correctly synchronize blits in OOQ Related-To: NEO-6444 Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
f20236c7f2
commit
e3bb526067
@@ -681,7 +681,6 @@ void CommandQueue::updateBcsTaskCount(aub_stream::EngineType bcsEngineType, uint
|
||||
|
||||
// Returns the taskCount stored in the CopyEngineState slot selected by
// EngineHelpers::getBcsIndex(bcsEngineType).
// NOTE(review): this hunk is rendered without +/- markers and its header reports
// 7 old lines vs 6 new lines, so one of the lines shown below was presumably removed
// by this commit — confirm against the repository which one.
uint32_t CommandQueue::peekBcsTaskCount(aub_stream::EngineType bcsEngineType) const {
|
||||
const CopyEngineState &state = bcsStates[EngineHelpers::getBcsIndex(bcsEngineType)];
|
||||
DEBUG_BREAK_IF(!state.isValid());
|
||||
return state.taskCount;
|
||||
}
|
||||
|
||||
@@ -706,10 +705,6 @@ void CommandQueue::obtainNewTimestampPacketNodes(size_t numberOfNodes, Timestamp
|
||||
|
||||
previousNodes.swapNodes(*timestampPacketContainer);
|
||||
|
||||
if ((previousNodes.peekNodes().size() > 0) && (previousNodes.peekNodes()[0]->getAllocator() != allocator)) {
|
||||
clearAllDependencies = false;
|
||||
}
|
||||
|
||||
if (clearAllDependencies) {
|
||||
previousNodes.moveNodesToNewContainer(*deferredTimestampPackets);
|
||||
}
|
||||
@@ -1007,4 +1002,61 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan
|
||||
}
|
||||
}
|
||||
|
||||
void CommandQueue::setupBarrierTimestampForBcsEngines(aub_stream::EngineType engineType, TimestampPacketDependencies ×tampPacketDependencies) {
|
||||
if (!getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Ensure we have exactly 1 barrier node.
|
||||
if (timestampPacketDependencies.barrierNodes.peekNodes().empty()) {
|
||||
timestampPacketDependencies.barrierNodes.add(getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag());
|
||||
}
|
||||
|
||||
if (isOOQEnabled()) {
|
||||
// Barrier node will be signalled on gpgpuCsr. Save it for later use on blitters.
|
||||
for (auto currentBcsIndex = 0u; currentBcsIndex < bcsTimestampPacketContainers.size(); currentBcsIndex++) {
|
||||
const auto currentBcsEngineType = EngineHelpers::mapBcsIndexToEngineType(currentBcsIndex, true);
|
||||
if (currentBcsEngineType == engineType) {
|
||||
// Node is already added to barrierNodes for this engine, no need to save it.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Save latest timestamp (override previous, if any).
|
||||
TimestampPacketContainer newContainer{};
|
||||
newContainer.assignAndIncrementNodesRefCounts(timestampPacketDependencies.barrierNodes);
|
||||
bcsTimestampPacketContainers[currentBcsIndex].lastBarrierToWaitFor.swapNodes(newContainer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Moves the barrier nodes saved for the given copy engine into the barrier
 * nodes of the blit that is about to be submitted.
 *
 * @param bcsEngineType    copy engine whose lastBarrierToWaitFor container is drained
 * @param blitDependencies dependencies of the blit; its barrierNodes receive the moved nodes
 */
void CommandQueue::processBarrierTimestampForBcsEngine(aub_stream::EngineType bcsEngineType, TimestampPacketDependencies &blitDependencies) {
    const auto bcsIndex = EngineHelpers::getBcsIndex(bcsEngineType);
    auto &pendingBarrier = bcsTimestampPacketContainers[bcsIndex].lastBarrierToWaitFor;
    pendingBarrier.moveNodesToNewContainer(blitDependencies.barrierNodes);
}
|
||||
|
||||
/**
 * Records the queue's current timestamp packet nodes as the last packet
 * signalled on the given copy engine. Has no effect for in-order queues.
 *
 * @param bcsEngineType copy engine whose lastSignalledPacket container is replaced
 */
void CommandQueue::setLastBcsPacket(aub_stream::EngineType bcsEngineType) {
    if (!isOOQEnabled()) {
        return;
    }

    // Take an additional reference on the current nodes, then exchange them with
    // whatever was stored for this engine before.
    TimestampPacketContainer currentPackets{};
    currentPackets.assignAndIncrementNodesRefCounts(*this->timestampPacketContainer);

    auto &lastSignalled = bcsTimestampPacketContainers[EngineHelpers::getBcsIndex(bcsEngineType)].lastSignalledPacket;
    lastSignalled.swapNodes(currentPackets);
}
|
||||
|
||||
/**
 * Appends the lastSignalledPacket container of every copy engine that holds
 * at least one node to the given CSR dependencies.
 *
 * @param csrDeps dependencies whose timestampPacketContainer list is extended with pointers
 *                to the queue-owned containers
 */
void CommandQueue::fillCsrDependenciesWithLastBcsPackets(CsrDependencies &csrDeps) {
    for (auto &perEngine : bcsTimestampPacketContainers) {
        auto &lastPacket = perEngine.lastSignalledPacket;
        // Engines without a recorded packet contribute nothing.
        if (!lastPacket.peekNodes().empty()) {
            csrDeps.timestampPacketContainer.push_back(&lastPacket);
        }
    }
}
|
||||
|
||||
void CommandQueue::clearLastBcsPackets() {
|
||||
for (BcsTimestampPacketContainers &bcsContainers : bcsTimestampPacketContainers) {
|
||||
bcsContainers.lastSignalledPacket.moveNodesToNewContainer(*deferredTimestampPackets);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -325,6 +325,12 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
|
||||
void updateLatestSentEnqueueType(EnqueueProperties::Operation newEnqueueType) { this->latestSentEnqueueType = newEnqueueType; }
|
||||
|
||||
void setupBarrierTimestampForBcsEngines(aub_stream::EngineType engineType, TimestampPacketDependencies &timestampPacketDependencies);
|
||||
void processBarrierTimestampForBcsEngine(aub_stream::EngineType bcsEngineType, TimestampPacketDependencies &blitDependencies);
|
||||
void setLastBcsPacket(aub_stream::EngineType bcsEngineType);
|
||||
void fillCsrDependenciesWithLastBcsPackets(CsrDependencies &csrDeps);
|
||||
void clearLastBcsPackets();
|
||||
|
||||
// taskCount of last task
|
||||
uint32_t taskCount = 0;
|
||||
|
||||
@@ -409,6 +415,11 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
|
||||
std::unique_ptr<TimestampPacketContainer> deferredTimestampPackets;
|
||||
std::unique_ptr<TimestampPacketContainer> timestampPacketContainer;
|
||||
// Timestamp packet bookkeeping kept per copy (BCS) engine; instances are stored in
// the bcsTimestampPacketContainers array and indexed by BCS index.
struct BcsTimestampPacketContainers {
|
||||
// Barrier nodes saved by setupBarrierTimestampForBcsEngines; moved into the blit's
// barrierNodes by processBarrierTimestampForBcsEngine.
TimestampPacketContainer lastBarrierToWaitFor;
|
||||
// Nodes recorded by setLastBcsPacket; added to CSR dependencies by
// fillCsrDependenciesWithLastBcsPackets and moved to deferredTimestampPackets by clearLastBcsPackets.
TimestampPacketContainer lastSignalledPacket;
|
||||
};
|
||||
std::array<BcsTimestampPacketContainers, bcsInfoMaskSize> bcsTimestampPacketContainers;
|
||||
};
|
||||
|
||||
template <typename PtrType>
|
||||
|
||||
@@ -249,6 +249,10 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
timestampPacketDependencies, eventsRequest, blockQueue);
|
||||
}
|
||||
|
||||
if (!blockQueue && isOOQEnabled()) {
|
||||
setupBarrierTimestampForBcsEngines(getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), timestampPacketDependencies);
|
||||
}
|
||||
|
||||
if (eventBuilder.getEvent() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
|
||||
eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
|
||||
@@ -536,8 +540,6 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandS
|
||||
device->getHardwareInfo(),
|
||||
args);
|
||||
}
|
||||
|
||||
TimestampPacketHelper::programSemaphore<GfxFamily>(*commandStream, *currentTimestampPacketNode);
|
||||
}
|
||||
return blitProperties;
|
||||
}
|
||||
@@ -898,8 +900,13 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
|
||||
dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired;
|
||||
dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode;
|
||||
|
||||
const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
|
||||
|
||||
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() && !clearDependenciesForSubCapture) {
|
||||
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
|
||||
if (isHandlingBarrier) {
|
||||
fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
|
||||
}
|
||||
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
|
||||
}
|
||||
|
||||
@@ -937,6 +944,10 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
|
||||
dispatchFlags,
|
||||
getDevice());
|
||||
|
||||
if (isHandlingBarrier) {
|
||||
clearLastBcsPackets();
|
||||
}
|
||||
|
||||
if (gtpinIsGTPinInitialized()) {
|
||||
gtpinNotifyFlushTask(completionStamp.taskCount);
|
||||
}
|
||||
@@ -1119,8 +1130,13 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
|
||||
false, //memoryMigrationRequired
|
||||
false); //textureCacheFlush
|
||||
|
||||
const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
|
||||
|
||||
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
|
||||
if (isHandlingBarrier) {
|
||||
fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
|
||||
}
|
||||
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
|
||||
}
|
||||
|
||||
@@ -1133,6 +1149,10 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
|
||||
taskLevel,
|
||||
dispatchFlags,
|
||||
getDevice());
|
||||
|
||||
if (isHandlingBarrier) {
|
||||
clearLastBcsPackets();
|
||||
}
|
||||
}
|
||||
|
||||
if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
|
||||
@@ -1208,9 +1228,10 @@ void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispat
|
||||
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
|
||||
}
|
||||
|
||||
if (!blockQueue && getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired()) {
|
||||
timestampPacketDependencies.barrierNodes.add(allocator->getTag());
|
||||
if (!blockQueue) {
|
||||
setupBarrierTimestampForBcsEngines(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);
|
||||
}
|
||||
processBarrierTimestampForBcsEngine(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);
|
||||
|
||||
obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
|
||||
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
|
||||
@@ -1243,6 +1264,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispat
|
||||
}
|
||||
|
||||
this->latestSentEnqueueType = enqueueProperties.operation;
|
||||
|
||||
setLastBcsPacket(bcsCsr.getOsContext().getEngineType());
|
||||
}
|
||||
updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
|
||||
|
||||
|
||||
@@ -215,6 +215,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
|
||||
commandQueue.getGpgpuCommandStreamReceiver(), *bcsCsrForAuxTranslation);
|
||||
}
|
||||
|
||||
if (timestampPacketDependencies && commandQueue.isOOQEnabled()) {
|
||||
commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies);
|
||||
}
|
||||
|
||||
const auto &kernelDescriptor = kernel->getKernelInfo().kernelDescriptor;
|
||||
|
||||
auto memoryCompressionState = commandStreamReceiver.getMemoryCompressionState(kernel->isAuxTranslationRequired(), commandQueue.getDevice().getHardwareInfo());
|
||||
@@ -254,8 +258,13 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
|
||||
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
|
||||
}
|
||||
|
||||
const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
|
||||
|
||||
if (timestampPacketDependencies) {
|
||||
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
|
||||
if (isHandlingBarrier) {
|
||||
commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
|
||||
}
|
||||
dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies->barrierNodes;
|
||||
}
|
||||
dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = kernel->requiresSpecialPipelineSelectMode();
|
||||
@@ -289,6 +298,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
|
||||
dispatchFlags,
|
||||
commandQueue.getDevice());
|
||||
|
||||
if (isHandlingBarrier) {
|
||||
commandQueue.clearLastBcsPackets();
|
||||
}
|
||||
|
||||
if (kernelOperation->blitPropertiesContainer.size() > 0) {
|
||||
const auto newTaskCount = bcsCsrForAuxTranslation->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
|
||||
commandQueue.updateBcsTaskCount(bcsCsrForAuxTranslation->getOsContext().getEngineType(), newTaskCount);
|
||||
@@ -330,6 +343,7 @@ void CommandWithoutKernel::dispatchBlitOperation() {
|
||||
|
||||
const auto newTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
|
||||
commandQueue.updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
|
||||
commandQueue.setLastBcsPacket(bcsCsr->getOsContext().getEngineType());
|
||||
}
|
||||
|
||||
CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminated) {
|
||||
@@ -361,6 +375,10 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
|
||||
}
|
||||
}
|
||||
|
||||
if (timestampPacketDependencies && commandQueue.isOOQEnabled()) {
|
||||
commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies);
|
||||
}
|
||||
|
||||
auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex();
|
||||
DispatchFlags dispatchFlags(
|
||||
{}, //csrDependencies
|
||||
@@ -397,8 +415,13 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
|
||||
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
|
||||
}
|
||||
|
||||
const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
|
||||
|
||||
if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
|
||||
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
|
||||
if (isHandlingBarrier) {
|
||||
commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
|
||||
}
|
||||
makeTimestampPacketsResident(commandStreamReceiver);
|
||||
}
|
||||
|
||||
@@ -413,6 +436,10 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
|
||||
dispatchFlags,
|
||||
commandQueue.getDevice());
|
||||
|
||||
if (isHandlingBarrier) {
|
||||
commandQueue.clearLastBcsPackets();
|
||||
}
|
||||
|
||||
if (kernelOperation->blitEnqueue) {
|
||||
dispatchBlitOperation();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user