Optimize timestamp packet dependencies

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2021-12-29 14:28:21 +00:00
committed by Compute-Runtime-Automation
parent 882ae8088f
commit 95585a81f7
11 changed files with 529 additions and 39 deletions

View File

@@ -683,7 +683,6 @@ void CommandQueue::updateBcsTaskCount(aub_stream::EngineType bcsEngineType, uint
// Returns the task count stored in the copy-engine state slot that corresponds
// to bcsEngineType. DEBUG_BREAK_IF is invoked when that slot reports it is not valid.
uint32_t CommandQueue::peekBcsTaskCount(aub_stream::EngineType bcsEngineType) const {
    const auto bcsIndex = EngineHelpers::getBcsIndex(bcsEngineType);
    const CopyEngineState &engineState = bcsStates[bcsIndex];
    DEBUG_BREAK_IF(!engineState.isValid());
    return engineState.taskCount;
}
@@ -708,10 +707,6 @@ void CommandQueue::obtainNewTimestampPacketNodes(size_t numberOfNodes, Timestamp
previousNodes.swapNodes(*timestampPacketContainer);
if ((previousNodes.peekNodes().size() > 0) && (previousNodes.peekNodes()[0]->getAllocator() != allocator)) {
clearAllDependencies = false;
}
if (clearAllDependencies) {
previousNodes.moveNodesToNewContainer(*deferredTimestampPackets);
}
@@ -1016,4 +1011,61 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan
}
}
// Prepares the barrier timestamp node for a submission on engineType.
//
// Does nothing unless the GPGPU command stream receiver reports that stalling
// commands are required on the next flush. Otherwise:
//  - if timestampPacketDependencies.barrierNodes is empty, one tag obtained from
//    the GPGPU CSR's timestamp packet allocator is added to it;
//  - when the queue is out-of-order, every BCS slot whose engine type differs from
//    engineType gets its lastBarrierToWaitFor replaced with a ref-counted copy of
//    those barrier nodes.
void CommandQueue::setupBarrierTimestampForBcsEngines(aub_stream::EngineType engineType, TimestampPacketDependencies &timestampPacketDependencies) {
    if (!getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired()) {
        return;
    }

    // Guarantee the barrier container is not empty before it is shared below.
    auto &barrierNodes = timestampPacketDependencies.barrierNodes;
    if (barrierNodes.peekNodes().empty()) {
        barrierNodes.add(getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag());
    }

    if (!isOOQEnabled()) {
        return;
    }

    // The barrier node is remembered per BCS slot so a later blit on that engine can pick it up.
    for (auto bcsIndex = 0u; bcsIndex < bcsTimestampPacketContainers.size(); ++bcsIndex) {
        if (EngineHelpers::mapBcsIndexToEngineType(bcsIndex, true) == engineType) {
            // This engine already receives the node through barrierNodes itself.
            continue;
        }

        // Replace whatever barrier was stored earlier with the newest one.
        TimestampPacketContainer latestBarrier{};
        latestBarrier.assignAndIncrementNodesRefCounts(barrierNodes);
        bcsTimestampPacketContainers[bcsIndex].lastBarrierToWaitFor.swapNodes(latestBarrier);
    }
}
// Hands the barrier nodes stored for bcsEngineType (lastBarrierToWaitFor) over to
// blitDependencies.barrierNodes via moveNodesToNewContainer.
void CommandQueue::processBarrierTimestampForBcsEngine(aub_stream::EngineType bcsEngineType, TimestampPacketDependencies &blitDependencies) {
    const auto bcsIndex = EngineHelpers::getBcsIndex(bcsEngineType);
    auto &pendingBarrier = bcsTimestampPacketContainers[bcsIndex].lastBarrierToWaitFor;
    pendingBarrier.moveNodesToNewContainer(blitDependencies.barrierNodes);
}
// For out-of-order queues, records the queue's current timestamp packet nodes as
// the last signalled packet of the given BCS engine. In-order queues are left untouched.
void CommandQueue::setLastBcsPacket(aub_stream::EngineType bcsEngineType) {
    if (!isOOQEnabled()) {
        return;
    }

    // Take a ref-counted copy of the current nodes, then swap it into the engine's slot.
    TimestampPacketContainer currentPacketCopy{};
    currentPacketCopy.assignAndIncrementNodesRefCounts(*timestampPacketContainer);

    auto &engineContainers = bcsTimestampPacketContainers[EngineHelpers::getBcsIndex(bcsEngineType)];
    engineContainers.lastSignalledPacket.swapNodes(currentPacketCopy);
}
// Appends to csrDeps.timestampPacketContainer a pointer to the lastSignalledPacket
// container of every BCS slot that currently holds at least one node.
void CommandQueue::fillCsrDependenciesWithLastBcsPackets(CsrDependencies &csrDeps) {
    for (auto &engineContainers : bcsTimestampPacketContainers) {
        auto &lastPacket = engineContainers.lastSignalledPacket;
        if (!lastPacket.peekNodes().empty()) {
            csrDeps.timestampPacketContainer.push_back(&lastPacket);
        }
    }
}
void CommandQueue::clearLastBcsPackets() {
for (BcsTimestampPacketContainers &bcsContainers : bcsTimestampPacketContainers) {
bcsContainers.lastSignalledPacket.moveNodesToNewContainer(*deferredTimestampPackets);
}
}
} // namespace NEO

View File

@@ -324,6 +324,12 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
void updateLatestSentEnqueueType(EnqueueProperties::Operation newEnqueueType) { this->latestSentEnqueueType = newEnqueueType; }
EnqueueProperties::Operation peekLatestSentEnqueueOperation() { return this->latestSentEnqueueType; }
// When the GPGPU CSR requires stalling commands on the next flush: ensures barrierNodes is non-empty and,
// for out-of-order queues, stores a copy of the barrier nodes for every BCS engine other than engineType.
void setupBarrierTimestampForBcsEngines(aub_stream::EngineType engineType, TimestampPacketDependencies &timestampPacketDependencies);
// Moves the barrier nodes stored for bcsEngineType into blitDependencies.barrierNodes.
void processBarrierTimestampForBcsEngine(aub_stream::EngineType bcsEngineType, TimestampPacketDependencies &blitDependencies);
// For out-of-order queues, remembers the queue's current timestamp packet nodes as the last packet of bcsEngineType.
void setLastBcsPacket(aub_stream::EngineType bcsEngineType);
// Adds every non-empty per-BCS-engine last packet container to csrDeps.
void fillCsrDependenciesWithLastBcsPackets(CsrDependencies &csrDeps);
// Moves all per-BCS-engine last packet nodes to the deferred timestamp packets container.
void clearLastBcsPackets();
// taskCount of last task
uint32_t taskCount = 0;
@@ -409,6 +415,11 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
std::unique_ptr<TimestampPacketContainer> deferredTimestampPackets;
std::unique_ptr<TimestampPacketContainer> timestampPacketContainer;
// Timestamp packet containers tracked separately for each BCS engine slot of the queue.
struct BcsTimestampPacketContainers {
// Barrier nodes saved by setupBarrierTimestampForBcsEngines; moved into a blit's barrierNodes by processBarrierTimestampForBcsEngine.
TimestampPacketContainer lastBarrierToWaitFor;
// Copy of the queue's timestamp packet nodes taken by setLastBcsPacket; added to CSR dependencies by
// fillCsrDependenciesWithLastBcsPackets and moved to the deferred container by clearLastBcsPackets.
TimestampPacketContainer lastSignalledPacket;
};
std::array<BcsTimestampPacketContainers, bcsInfoMaskSize> bcsTimestampPacketContainers;
};
template <typename PtrType>

View File

@@ -248,6 +248,10 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
timestampPacketDependencies, eventsRequest, blockQueue);
}
if (!blockQueue && isOOQEnabled()) {
setupBarrierTimestampForBcsEngines(computeCommandStreamReceiver.getOsContext().getEngineType(), timestampPacketDependencies);
}
if (eventBuilder.getEvent() && computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
@@ -536,8 +540,6 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandS
hwInfo,
args);
}
TimestampPacketHelper::programSemaphore<GfxFamily>(*commandStream, *currentTimestampPacketNode);
}
return blitProperties;
}
@@ -893,8 +895,13 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired;
dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode;
const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() && !clearDependenciesForSubCapture) {
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
if (isHandlingBarrier) {
fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
}
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
}
@@ -932,6 +939,10 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
dispatchFlags,
getDevice());
if (isHandlingBarrier) {
clearLastBcsPackets();
}
if (gtpinIsGTPinInitialized()) {
gtpinNotifyFlushTask(completionStamp.taskCount);
}
@@ -1114,8 +1125,13 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
false, //memoryMigrationRequired
false); //textureCacheFlush
const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
if (isHandlingBarrier) {
fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
}
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
}
@@ -1128,6 +1144,10 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
taskLevel,
dispatchFlags,
getDevice());
if (isHandlingBarrier) {
clearLastBcsPackets();
}
}
if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
@@ -1203,9 +1223,10 @@ void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispat
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
}
if (!blockQueue && getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired()) {
timestampPacketDependencies.barrierNodes.add(allocator->getTag());
if (!blockQueue) {
setupBarrierTimestampForBcsEngines(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);
}
processBarrierTimestampForBcsEngine(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);
obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
@@ -1238,6 +1259,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispat
}
this->latestSentEnqueueType = enqueueProperties.operation;
setLastBcsPacket(bcsCsr.getOsContext().getEngineType());
}
updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());

View File

@@ -217,6 +217,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
commandQueue.getGpgpuCommandStreamReceiver(), *bcsCsrForAuxTranslation);
}
if (timestampPacketDependencies && commandQueue.isOOQEnabled()) {
commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies);
}
const auto &kernelDescriptor = kernel->getKernelInfo().kernelDescriptor;
auto memoryCompressionState = commandStreamReceiver.getMemoryCompressionState(kernel->isAuxTranslationRequired(), commandQueue.getDevice().getHardwareInfo());
@@ -256,8 +260,13 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
}
const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
if (timestampPacketDependencies) {
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
if (isHandlingBarrier) {
commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
}
dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies->barrierNodes;
}
dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = kernel->requiresSpecialPipelineSelectMode();
@@ -291,6 +300,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
dispatchFlags,
commandQueue.getDevice());
if (isHandlingBarrier) {
commandQueue.clearLastBcsPackets();
}
if (kernelOperation->blitPropertiesContainer.size() > 0) {
const auto newTaskCount = bcsCsrForAuxTranslation->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
commandQueue.updateBcsTaskCount(bcsCsrForAuxTranslation->getOsContext().getEngineType(), newTaskCount);
@@ -332,6 +345,7 @@ void CommandWithoutKernel::dispatchBlitOperation() {
const auto newTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
commandQueue.updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
commandQueue.setLastBcsPacket(bcsCsr->getOsContext().getEngineType());
}
CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminated) {
@@ -364,6 +378,10 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
}
}
if (timestampPacketDependencies && commandQueue.isOOQEnabled()) {
commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies);
}
auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex();
DispatchFlags dispatchFlags(
{}, //csrDependencies
@@ -400,8 +418,13 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
}
const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
if (isHandlingBarrier) {
commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
}
makeTimestampPacketsResident(commandStreamReceiver);
}
@@ -416,6 +439,10 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
dispatchFlags,
commandQueue.getDevice());
if (isHandlingBarrier) {
commandQueue.clearLastBcsPackets();
}
if (kernelOperation->blitEnqueue) {
dispatchBlitOperation();
}