Feature(OCL) Use tag nodes for root device synchronization

With this commit, events created on multi-root-device contexts
synchronize using signaled TagNodes instead of taskCounts.
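
In taskCount-based synchronization, a waiting CSR compares the dependent CSR's task count against a target; with tag nodes, the producer signals a dedicated GPU-visible dword that waiters check directly. As a rough host-side illustration only (hypothetical stand-in types, not NEO code):

#include <atomic>
#include <cstdint>
#include <thread>

// Hypothetical stand-in for a tag node: one GPU-visible dword that the
// producing device writes when its work completes.
struct TagNode {
    static constexpr uint32_t signaledValue = 1u;
    std::atomic<uint32_t> value{0u};

    void signal() { value.store(signaledValue, std::memory_order_release); }
    bool isSignaled() const { return value.load(std::memory_order_acquire) == signaledValue; }
};

int main() {
    TagNode node;
    // Producer: stands in for the dependent root device finishing its work.
    std::thread producer([&node] { node.signal(); });
    // Consumer: waits on the node itself rather than polling the producer's taskCount.
    while (!node.isSignaled()) {
        std::this_thread::yield();
    }
    producer.join();
    return 0;
}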

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>

Related-To: NEO-7105
Author: Maciej Plewka
Date: 2022-12-07 11:07:43 +00:00
Committed by: Compute-Runtime-Automation
Parent: 6fac234655
Commit: 547d1c37b3
62 changed files with 995 additions and 210 deletions


@@ -20,7 +20,6 @@ namespace NEO {
 void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) {
     auto csrOwnership = dependentCsr.obtainUniqueOwnership();
     dependentCsr.updateTagFromWait();
-    csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
 }
 
 void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const {
@@ -54,6 +53,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
         if (productHelper.isDcFlushAllowed()) {
             if (!dependentCsr.isLatestTaskCountFlushed()) {
                 flushDependentCsr(dependentCsr, csrDeps);
+                csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
                 currentCsr.makeResident(*dependentCsr.getTagAllocation());
             }
         }
@@ -62,23 +62,22 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
     }
 }
 
-void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const {
+void EventsRequest::fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const {
     for (cl_uint i = 0; i < this->numEventsInWaitList; i++) {
         auto event = castToObjectOrAbort<Event>(this->eventWaitList[i]);
         if (event->isUserEvent() || CompletionStamp::notReady == event->peekTaskCount()) {
             continue;
         }
 
         if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) {
+            auto timestampPacketContainer = event->getMultiRootDeviceTimestampPacketNodes();
+            if (!timestampPacketContainer || timestampPacketContainer->peekNodes().empty()) {
+                continue;
+            }
             auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver();
             if (!dependentCsr.isLatestTaskCountFlushed()) {
                 flushDependentCsr(dependentCsr, csrDeps);
-            } else {
-                csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
             }
             auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex());
             currentCsr.getResidencyAllocations().push_back(graphicsAllocation);
+            csrDeps.multiRootTimeStampSyncContainer.push_back(timestampPacketContainer);
         }
     }
 }
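
In short, the renamed function now records, per cross-root-device event, the event's multi-root timestamp packet container instead of a (taskCount, tagAddress) pair. A compilable toy model of that filtering step, using hypothetical stand-in types rather than NEO's actual classes:

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for NEO's TimestampPacketContainer / CsrDependencies.
struct TimestampPacketContainer {
    std::vector<int> nodes; // stands in for peekNodes()
};

struct CsrDependencies {
    std::vector<const TimestampPacketContainer *> multiRootTimeStampSyncContainer;
};

// Mirrors the loop's filtering: only events carrying non-empty
// multi-root-device tag node containers contribute a dependency.
void fillDependencies(CsrDependencies &deps, const std::vector<const TimestampPacketContainer *> &waitList) {
    for (const auto *container : waitList) {
        if (container == nullptr || container->nodes.empty()) {
            continue; // nothing to synchronize on for this event
        }
        deps.multiRootTimeStampSyncContainer.push_back(container);
    }
}

int main() {
    TimestampPacketContainer signaled{{1}};
    TimestampPacketContainer empty{};
    CsrDependencies deps;
    fillDependencies(deps, {&signaled, &empty, nullptr});
    std::printf("%zu dependency container(s)\n", deps.multiRootTimeStampSyncContainer.size()); // prints 1
    return 0;
}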


@@ -25,7 +25,7 @@ struct EventsRequest {
         : numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), outEvent(outEvent) {}
 
     void fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const;
-    void fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const;
+    void fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const;
     void setupBcsCsrForOutputEvent(CommandStreamReceiver &bcsCsr) const;
 
     cl_uint numEventsInWaitList;


@@ -117,10 +117,11 @@ CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminate
 CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> surfaces,
                                            bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
-                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount)
+                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount,
+                                           TagNodeBase *multiRootDeviceSyncNode)
     : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM),
       commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel),
-      kernelCount(kernelCount), preemptionMode(preemptionMode) {
+      kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) {
     UNRECOVERABLE_IF(nullptr == this->kernel);
     kernel->incRefInternal();
 }
@@ -162,6 +163,9 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term
         printfHandler->makeResident(commandStreamReceiver);
     }
     makeTimestampPacketsResident(commandStreamReceiver);
+    if (multiRootDeviceSyncNode != nullptr) {
+        commandStreamReceiver.makeResident(*multiRootDeviceSyncNode->getBaseGraphicsAllocation());
+    }
 
     if (kernelOperation->blitPropertiesContainer.size() > 0) {
         CsrDependencies csrDeps;
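
The new residency step follows a simple optional-dependency pattern: the sync node pointer may be null (no cross-device synchronization needed), and its allocation is made resident only when present. A self-contained sketch of that pattern with hypothetical types (the real CommandStreamReceiver::makeResident API differs):

#include <cstdio>

struct TagNodeBase { int baseGraphicsAllocation = 0; }; // hypothetical stand-in

struct Command {
    TagNodeBase *multiRootDeviceSyncNode; // nullptr when no cross-device sync is needed
    explicit Command(TagNodeBase *node) : multiRootDeviceSyncNode(node) {}

    void submit() {
        if (multiRootDeviceSyncNode != nullptr) {
            // stands in for commandStreamReceiver.makeResident(*node->getBaseGraphicsAllocation())
            std::printf("sync node allocation made resident\n");
        }
    }
};

int main() {
    TagNodeBase node;
    Command{&node}.submit();   // prints: sync node allocation made resident
    Command{nullptr}.submit(); // no-op
    return 0;
}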
@@ -213,7 +217,7 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term
                                           false); // hasRelaxedOrderingDependencies
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver);
     }
 
     const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
@@ -306,7 +310,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() {
     blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(blitProperties.csrDependencies, *bcsCsr);
+        eventsRequest.fillCsrDependenciesForRootDevices(blitProperties.csrDependencies, *bcsCsr);
     }
 
     const auto newTaskCount = bcsCsr->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
@@ -388,7 +392,7 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term
                                           false); // hasRelaxedOrderingDependencies
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver);
     }
 
     const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();


@@ -131,7 +131,7 @@ class CommandComputeKernel : public Command {
   public:
     CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> surfaces,
                          bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
-                         PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount);
+                         PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, TagNodeBase *multiRootDeviceSyncNode);
 
     ~CommandComputeKernel() override;
@@ -150,6 +150,7 @@ class CommandComputeKernel : public Command {
     Kernel *kernel;
     uint32_t kernelCount;
     PreemptionMode preemptionMode;
+    TagNodeBase *multiRootDeviceSyncNode;
 };
 
 class CommandWithoutKernel : public Command {