diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index f94a55679c..1bbdfd00b3 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -104,7 +104,9 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr timestampPacketContainer = std::make_unique(); deferredTimestampPackets = std::make_unique(); } - + if (context && context->getRootDeviceIndices().size() > 1) { + deferredMultiRootSyncNodes = std::make_unique(); + } auto deferCmdQBcsInitialization = hwInfo.featureTable.ftrBcsInfo.count() > 1u; if (DebugManager.flags.DeferCmdQBcsInitialization.get() != -1) { @@ -1248,6 +1250,10 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri if (deferredTimestampPackets) { deferredTimestampPackets->swapNodes(nodesToRelease); } + TimestampPacketContainer multiRootSyncNodesToRelease; + if (deferredMultiRootSyncNodes.get()) { + deferredMultiRootSyncNodes->swapNodes(multiRootSyncNodesToRelease); + } waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps); diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 8d9a210b00..c55d0544cd 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -439,6 +439,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool requiresCacheFlushAfterWalker = false; std::unique_ptr deferredTimestampPackets; + std::unique_ptr deferredMultiRootSyncNodes; std::unique_ptr timestampPacketContainer; struct BcsTimestampPacketContainers { diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 5679c2f616..d9225c3727 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ 
-391,7 +391,8 @@ class CommandQueueHw : public CommandQueue { EventsRequest &eventsRequest, EventBuilder &externalEventBuilder, std::unique_ptr &&printfHandler, - CommandStreamReceiver *bcsCsr); + CommandStreamReceiver *bcsCsr, + TagNodeBase *multiRootDeviceSyncNode); CompletionStamp enqueueCommandWithoutKernel(Surface **surfaces, size_t surfaceCount, @@ -422,7 +423,7 @@ class CommandQueueHw : public CommandQueue { TimestampPacketDependencies ×tampPacketDependencies, const EventsRequest &eventsRequest, LinearStream *commandStream, - uint32_t commandType, bool queueBlocked); + uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync); void submitCacheFlush(Surface **surfaces, size_t numSurfaces, LinearStream *commandStream, @@ -433,6 +434,8 @@ class CommandQueueHw : public CommandQueue { bool waitForTimestamps(Range copyEnginesToWait, TaskCountType taskCount, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override; MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const; + MOCKABLE_VIRTUAL void processSignalMultiRootDeviceNode(LinearStream *commandStream, + TagNodeBase *node); protected: MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){}; @@ -473,7 +476,7 @@ class CommandQueueHw : public CommandQueue { blockedCommandsData = std::make_unique(commandStream, *gpgpuCsr.getInternalAllocationStorage()); } else { commandStream = &getCommandStream(*this, csrDependencies, profilingRequired, perfCountersRequired, - blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0); + blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0, eventsRequest.outEvent); } return commandStream; } diff --git a/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl 
b/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl index f2f9bb590a..27b00dd488 100644 --- a/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl +++ b/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl @@ -45,7 +45,7 @@ bool CommandQueueHw::isCacheFlushCommand(uint32_t commandType) const { } template <> -LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) { +LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) { size_t expectedSizeCS = 0; [[maybe_unused]] bool usePostSync = false; if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 554f70cbcd..a9bf384746 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -45,6 +45,7 @@ #include namespace NEO { +struct RootDeviceEnvironment; template template @@ -178,7 +179,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, BlitPropertiesContainer blitPropertiesContainer; if (this->context->getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver); + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, computeCommandStreamReceiver); } const bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo); @@ -227,7 +228,7 @@ cl_int 
CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } if (this->context->getRootDeviceIndices().size() > 1) { - TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStream, csrDeps); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStream, csrDeps); } if (enqueueWithBlitAuxTranslation) { @@ -281,6 +282,18 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } else if (isMarkerWithPostSyncWrite) { processDispatchForMarker(*this, &commandStream, eventsRequest, csrDeps); } + TagNodeBase *multiRootEventSyncStamp = nullptr; + if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1 && + !(multiDispatchInfo.empty() && CL_COMMAND_MARKER != commandType)) { + multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode(); + if (!blockQueue) { + this->getGpgpuCommandStreamReceiver().makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation()); + } + processSignalMultiRootDeviceNode(&commandStream, multiRootEventSyncStamp); + if (CL_COMMAND_MARKER == commandType) { + flushDependenciesForNonKernelCommand = true; + } + } CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0}; const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType), @@ -383,13 +396,17 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, eventsRequest, eventBuilder, std::move(printfHandler), - nullptr); + nullptr, + multiRootEventSyncStamp); } if (deferredTimestampPackets.get()) { timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets); csrDeps.copyNodesToNewContainer(*deferredTimestampPackets); } + if (deferredMultiRootSyncNodes.get()) { + csrDeps.copyRootDeviceSyncNodesToNewContainer(*deferredMultiRootSyncNodes); + } commandStreamReceiverOwnership.unlock(); queueOwnership.unlock(); @@ -498,7 +515,7 @@ BlitProperties 
CommandQueueHw::processDispatchForBlitEnqueue(CommandS const MultiDispatchInfo &multiDispatchInfo, TimestampPacketDependencies ×tampPacketDependencies, const EventsRequest &eventsRequest, LinearStream *commandStream, - uint32_t commandType, bool queueBlocked) { + uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync) { auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType); auto blitProperties = ClBlitProperties::constructProperties(blitDirection, blitCommandStreamReceiver, @@ -511,7 +528,7 @@ BlitProperties CommandQueueHw::processDispatchForBlitEnqueue(CommandS blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.previousEnqueueNodes); blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.barrierNodes); } - + blitProperties.multiRootDeviceEventSync = multiRootDeviceEventSync; auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0); blitProperties.outputTimestampPacket = currentTimestampPacketNode; @@ -617,7 +634,20 @@ void CommandQueueHw::processDispatchForMarker(CommandQueue &commandQu HardwareInterface::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); getGpgpuCommandStreamReceiver().makeResident(*hwTimeStamps->getBaseGraphicsAllocation()); } - +template +void CommandQueueHw::processSignalMultiRootDeviceNode(LinearStream *commandStream, + TagNodeBase *node) { + const auto &hwInfo = getDevice().getHardwareInfo(); + PipeControlArgs args; + args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, device->getRootDeviceEnvironment()); + MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + *commandStream, + PostSyncMode::ImmediateData, + node->getGpuAddress() + node->getContextEndOffset(), + std::numeric_limits::max(), + hwInfo, + args); +} template void CommandQueueHw::processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue, LinearStream 
*commandStream, @@ -903,7 +933,8 @@ void CommandQueueHw::enqueueBlocked( EventsRequest &eventsRequest, EventBuilder &externalEventBuilder, std::unique_ptr &&printfHandler, - CommandStreamReceiver *bcsCsr) { + CommandStreamReceiver *bcsCsr, + TagNodeBase *multiRootDeviceSyncNode) { TakeOwnershipWrapper> queueOwnership(*this); @@ -974,7 +1005,8 @@ void CommandQueueHw::enqueueBlocked( std::move(printfHandler), preemptionMode, multiDispatchInfo.peekMainKernel(), - (uint32_t)multiDispatchInfo.size()); + (uint32_t)multiDispatchInfo.size(), + multiRootDeviceSyncNode); } if (storeTimestampPackets) { command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies)); @@ -1281,10 +1313,14 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp } TimestampPacketDependencies timestampPacketDependencies; + TagNodeBase *multiRootEventSyncStamp = nullptr; BlitPropertiesContainer blitPropertiesContainer; CsrDependencies csrDeps; eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All); + if (this->context->getRootDeviceIndices().size() > 1) { + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, bcsCsr); + } auto allocator = bcsCsr.getTimestampPacketAllocator(); if (!blockQueue) { @@ -1311,6 +1347,10 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp if (eventBuilder.getEvent()) { eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer); } + if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) { + multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode(); + bcsCsr.makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation()); + } CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0}; @@ -1327,7 +1367,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp } 
blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, gpgpuCommandStream, cmdType, blockQueue)); + eventsRequest, gpgpuCommandStream, cmdType, blockQueue, multiRootEventSyncStamp)); if (!blockQueue) { completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking, @@ -1354,7 +1394,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp updateFromCompletionStamp(completionStamp, eventBuilder.getEvent()); if (blockQueue) { - enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr); + enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp); if (gpgpuSubmission) { if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) { @@ -1365,6 +1405,9 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets); csrDeps.copyNodesToNewContainer(*deferredTimestampPackets); + if (deferredMultiRootSyncNodes.get()) { + csrDeps.copyRootDeviceSyncNodesToNewContainer(*deferredMultiRootSyncNodes); + } if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) { commandStreamReceiverOwnership.unlock(); } diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h index e22858c851..a7ae002f44 100644 --- a/opencl/source/command_queue/gpgpu_walker.h +++ b/opencl/source/command_queue/gpgpu_walker.h @@ -88,7 +88,7 @@ class GpgpuWalkerHelper { template struct EnqueueOperation { using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, 
bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList); + static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent); static size_t getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo); static size_t getSizeRequiredForTimestampPacketWrite(); static size_t getSizeForCacheFlushAfterWalkerCommands(const Kernel &kernel, const CommandQueue &commandQueue); @@ -101,8 +101,8 @@ struct EnqueueOperation { template LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, - Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) { - size_t expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList); + Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) { + size_t expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList, outEvent); return commandQueue.getCS(expectedSizeCS); } diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl index 
66b6096611..6780ac7f57 100644 --- a/opencl/source/command_queue/gpgpu_walker_base.inl +++ b/opencl/source/command_queue/gpgpu_walker_base.inl @@ -166,7 +166,7 @@ size_t GpgpuWalkerHelper::getSizeForWaDisableRccRhwoOptimization(cons } template -size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist) { +size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist, cl_event *outEvent) { size_t expectedSizeCS = 0; auto &hwInfo = commandQueue.getDevice().getHardwareInfo(); auto &gfxCoreHelper = commandQueue.getDevice().getGfxCoreHelper(); @@ -219,8 +219,14 @@ size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, c if (DebugManager.flags.GpuScratchRegWriteAfterWalker.get() != -1) { expectedSizeCS += sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM); } - - expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps); + expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps); + if (outEvent) { + auto pEvent = castToObjectOrAbort(*outEvent); + if ((pEvent->getContext()->getRootDeviceIndices().size() > 1) && (!pEvent->isUserEvent())) { + expectedSizeCS += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); + } + } + expectedSizeCS += MemorySynchronizationCommands::getSizeForSingleBarrier(false); return expectedSizeCS; } diff --git a/opencl/source/command_queue/hardware_interface.h b/opencl/source/command_queue/hardware_interface.h index 92c59afcc3..be8ae196c2 100644 --- 
a/opencl/source/command_queue/hardware_interface.h +++ b/opencl/source/command_queue/hardware_interface.h @@ -33,6 +33,7 @@ struct HardwareInterfaceWalkerArgs { size_t localWorkSizes[3] = {}; TagNodeBase *hwTimeStamps = nullptr; TagNodeBase *hwPerfCounter = nullptr; + TagNodeBase *multiRootDeviceEventStamp = nullptr; TimestampPacketDependencies *timestampPacketDependencies = nullptr; TimestampPacketContainer *currentTimestampPacketNodes = nullptr; const Vec3 *numberOfWorkgroups = nullptr; diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index 8f61426f56..ac19321ea9 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -133,6 +133,7 @@ void HardwareInterface::dispatchWalker( walkerArgs.currentTimestampPacketNodes); walkerArgs.currentDispatchIndex = 0; + for (auto &dispatchInfo : multiDispatchInfo) { dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment()); walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel); diff --git a/opencl/source/context/context.cpp b/opencl/source/context/context.cpp index a725a72e65..7f0236f940 100644 --- a/opencl/source/context/context.cpp +++ b/opencl/source/context/context.cpp @@ -19,6 +19,7 @@ #include "shared/source/memory_manager/memory_manager.h" #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/utilities/heap_allocator.h" +#include "shared/source/utilities/tag_allocator.h" #include "opencl/source/cl_device/cl_device.h" #include "opencl/source/command_queue/command_queue.h" @@ -49,7 +50,9 @@ Context::Context( Context::~Context() { gtpinNotifyContextDestroy((cl_context)this); - + if (multiRootDeviceTimestampPacketAllocator.get() != nullptr) { + multiRootDeviceTimestampPacketAllocator.reset(); + } if 
(smallBufferPoolAllocator.isAggregatedSmallBuffersEnabled(this)) { smallBufferPoolAllocator.releaseSmallBufferPool(); } @@ -564,5 +567,15 @@ void Context::BufferPoolAllocator::releaseSmallBufferPool() { delete this->mainStorage; this->mainStorage = nullptr; } +TagAllocatorBase *Context::getMultiRootDeviceTimestampPacketAllocator() { + return multiRootDeviceTimestampPacketAllocator.get(); +} +void Context::setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr &allocator) { + multiRootDeviceTimestampPacketAllocator = std::move(allocator); +} + +std::unique_lock Context::obtainOwnershipForMultiRootDeviceAllocator() { + return std::unique_lock(multiRootDeviceAllocatorMtx); +} } // namespace NEO diff --git a/opencl/source/context/context.h b/opencl/source/context/context.h index f06597683a..45d359ecb5 100644 --- a/opencl/source/context/context.h +++ b/opencl/source/context/context.h @@ -37,6 +37,7 @@ class SharingFunctions; class SVMAllocsManager; class Program; class Platform; +class TagAllocatorBase; template <> struct OpenCLObjectMapper<_cl_context> { @@ -223,6 +224,9 @@ class Context : public BaseObject<_cl_context> { BufferPoolAllocator &getBufferPoolAllocator() { return this->smallBufferPoolAllocator; } + TagAllocatorBase *getMultiRootDeviceTimestampPacketAllocator(); + std::unique_lock obtainOwnershipForMultiRootDeviceAllocator(); + void setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr &allocator); protected: struct BuiltInKernel { @@ -263,6 +267,8 @@ class Context : public BaseObject<_cl_context> { uint32_t maxRootDeviceIndex = std::numeric_limits::max(); cl_bool preferD3dSharedResources = 0u; ContextType contextType = ContextType::CONTEXT_TYPE_DEFAULT; + std::unique_ptr multiRootDeviceTimestampPacketAllocator; + std::mutex multiRootDeviceAllocatorMtx; bool interopUserSync = false; bool resolvesRequiredInKernels = false; diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index 56c3c1647a..c257518913 100644 --- 
a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -884,7 +884,6 @@ TagNodeBase *Event::getHwTimeStampNode() { } TagNodeBase *Event::getHwPerfCounterNode() { - if (!perfCounterNode && cmdQueue->getPerfCounters()) { const uint32_t gpuReportSize = HwPerfCounter::getSize(*(cmdQueue->getPerfCounters())); perfCounterNode = cmdQueue->getGpgpuCommandStreamReceiver().getEventPerfCountAllocator(gpuReportSize)->getTag(); @@ -892,11 +891,27 @@ TagNodeBase *Event::getHwPerfCounterNode() { return perfCounterNode; } +TagNodeBase *Event::getMultiRootTimestampSyncNode() { + auto lock = getContext()->obtainOwnershipForMultiRootDeviceAllocator(); + if (getContext()->getMultiRootDeviceTimestampPacketAllocator() == nullptr) { + auto allocator = cmdQueue->getGpgpuCommandStreamReceiver().createMultiRootDeviceTimestampPacketAllocator(getContext()->getRootDeviceIndices()); + getContext()->setMultiRootDeviceTimestampPacketAllocator(allocator); + } + lock.unlock(); + if (multiRootDeviceTimestampPacketContainer.get() == nullptr) { + multiRootDeviceTimestampPacketContainer = std::make_unique(); + } + multiRootTimeStampSyncNode = getContext()->getMultiRootDeviceTimestampPacketAllocator()->getTag(); + multiRootDeviceTimestampPacketContainer->add(multiRootTimeStampSyncNode); + return multiRootTimeStampSyncNode; +} + void Event::addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer) { timestampPacketContainer->assignAndIncrementNodesRefCounts(inputTimestampPacketContainer); } TimestampPacketContainer *Event::getTimestampPacketNodes() const { return timestampPacketContainer.get(); } +TimestampPacketContainer *Event::getMultiRootDeviceTimestampPacketNodes() const { return multiRootDeviceTimestampPacketContainer.get(); } bool Event::checkUserEventDependencies(cl_uint numEventsInWaitList, const cl_event *eventWaitList) { bool userEventsDependencies = false; diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index 
a502fe415d..dd7ed18471 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -115,6 +115,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { void addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer); TimestampPacketContainer *getTimestampPacketNodes() const; + TimestampPacketContainer *getMultiRootDeviceTimestampPacketNodes() const; bool isPerfCountersEnabled() const { return perfCountersEnabled; @@ -129,6 +130,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { } TagNodeBase *getHwPerfCounterNode(); + TagNodeBase *getMultiRootTimestampSyncNode(); std::unique_ptr flushStamp; std::atomic taskLevel; @@ -384,8 +386,10 @@ class Event : public BaseObject<_cl_event>, public IDNode { bool perfCountersEnabled; TagNodeBase *timeStampNode = nullptr; TagNodeBase *perfCounterNode = nullptr; + TagNodeBase *multiRootTimeStampSyncNode = nullptr; std::unique_ptr timestampPacketContainer; // number of events this event depends on + std::unique_ptr multiRootDeviceTimestampPacketContainer; std::atomic parentCount; // event parents std::vector parentEvents; diff --git a/opencl/source/helpers/properties_helper.cpp b/opencl/source/helpers/properties_helper.cpp index 52dcaa6b1a..e34b1ffb00 100644 --- a/opencl/source/helpers/properties_helper.cpp +++ b/opencl/source/helpers/properties_helper.cpp @@ -26,7 +26,6 @@ namespace NEO { void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) { auto csrOwnership = dependentCsr.obtainUniqueOwnership(); dependentCsr.updateTagFromWait(); - csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast(dependentCsr.getTagAddress())}); } void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr, CsrDependencies::DependenciesType depsType) const { @@ -68,23 +67,22 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci } } -void 
EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr) const { +void EventsRequest::fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr) const { for (cl_uint i = 0; i < this->numEventsInWaitList; i++) { auto event = castToObjectOrAbort(this->eventWaitList[i]); if (event->isUserEvent() || CompletionStamp::notReady == event->peekTaskCount()) { continue; } - if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) { + auto timestampPacketContainer = event->getMultiRootDeviceTimestampPacketNodes(); + if (!timestampPacketContainer || timestampPacketContainer->peekNodes().empty()) { + continue; + } auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver(); if (!dependentCsr.isLatestTaskCountFlushed()) { flushDependentCsr(dependentCsr, csrDeps); - } else { - csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast(dependentCsr.getTagAddress())}); } - - auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex()); - currentCsr.getResidencyAllocations().push_back(graphicsAllocation); + csrDeps.multiRootTimeStampSyncContainer.push_back(timestampPacketContainer); } } } diff --git a/opencl/source/helpers/properties_helper.h b/opencl/source/helpers/properties_helper.h index 37b2c6564b..5a653b18e5 100644 --- a/opencl/source/helpers/properties_helper.h +++ b/opencl/source/helpers/properties_helper.h @@ -25,7 +25,7 @@ struct EventsRequest { : numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), outEvent(outEvent) {} void fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr, CsrDependencies::DependenciesType depsType) const; - void fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, 
CommandStreamReceiver ¤tCsr) const; + void fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr) const; void setupBcsCsrForOutputEvent(CommandStreamReceiver &bcsCsr) const; cl_uint numEventsInWaitList; diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index e91ff96ec8..75dbcf5661 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -120,10 +120,11 @@ CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminate CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector surfaces, bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr &&printfHandler, - PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount) + PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, + TagNodeBase *multiRootDeviceSyncNode) : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM), commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel), - kernelCount(kernelCount), preemptionMode(preemptionMode) { + kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) { UNRECOVERABLE_IF(nullptr == this->kernel); kernel->incRefInternal(); } @@ -165,6 +166,9 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term printfHandler->makeResident(commandStreamReceiver); } makeTimestampPacketsResident(commandStreamReceiver); + if (multiRootDeviceSyncNode != nullptr) { + commandStreamReceiver.makeResident(*multiRootDeviceSyncNode->getBaseGraphicsAllocation()); + } if (kernelOperation->blitPropertiesContainer.size() > 0) { CsrDependencies csrDeps; @@ -217,7 +221,7 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term false); // stateCacheInvalidation if 
(commandQueue.getContext().getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); + eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); } const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); @@ -310,7 +314,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() { blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0]; if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(blitProperties.csrDependencies, *bcsCsr); + eventsRequest.fillCsrDependenciesForRootDevices(blitProperties.csrDependencies, *bcsCsr); } const auto newTaskCount = bcsCsr->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice()); @@ -393,7 +397,7 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term false); // stateCacheInvalidation if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); + eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); } const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); diff --git a/opencl/source/helpers/task_information.h b/opencl/source/helpers/task_information.h index 68f378c693..c3ebd01b6c 100644 --- a/opencl/source/helpers/task_information.h +++ b/opencl/source/helpers/task_information.h @@ -127,7 +127,7 @@ class CommandComputeKernel : public Command { public: CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector surfaces, bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr 
&&printfHandler, - PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount); + PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, TagNodeBase *multiRootDeviceSyncNode); ~CommandComputeKernel() override; @@ -146,6 +146,7 @@ class CommandComputeKernel : public Command { Kernel *kernel; uint32_t kernelCount; PreemptionMode preemptionMode; + TagNodeBase *multiRootDeviceSyncNode; }; class CommandWithoutKernel : public Command { diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp index fa064de0cd..c369349555 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp @@ -19,6 +19,7 @@ #include "opencl/test/unit_test/command_queue/command_queue_fixture.h" #include "opencl/test/unit_test/fixtures/buffer_fixture.h" #include "opencl/test/unit_test/fixtures/image_fixture.h" +#include "opencl/test/unit_test/helpers/cl_hw_parse.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" @@ -193,6 +194,7 @@ class MockCommandStreamReceiverWithFailingFlushBatchedSubmission : public MockCo template struct MockCommandQueueHwWithOverwrittenCsr : public CommandQueueHw { using CommandQueueHw::CommandQueueHw; + using CommandQueueHw::timestampPacketContainer; MockCommandStreamReceiverWithFailingFlushBatchedSubmission *csr; CommandStreamReceiver &getGpgpuCommandStreamReceiver() const override { return *csr; } }; diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp index 2ea8f48fed..7fd8c44ef5 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp @@ -23,6 +23,7 @@ #include 
"opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" +#include "opencl/test/unit_test/mocks/mock_mdi.h" using namespace NEO; @@ -972,4 +973,4 @@ HWTEST_F(CommandQueueHwTest, GivenBuiltinKernelWhenBuiltinDispatchInfoBuilderIsP EXPECT_EQ(builder.paramsToUse.elws.x, dispatchInfo->getEnqueuedWorkgroupSize().x); EXPECT_EQ(builder.paramsToUse.offset.x, dispatchInfo->getOffset().x); EXPECT_EQ(builder.paramsToUse.kernel, dispatchInfo->getKernel()); -} +} \ No newline at end of file diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index c43df32c18..a902334eb6 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -31,6 +31,7 @@ #include "opencl/test/unit_test/fixtures/cl_device_fixture.h" #include "opencl/test/unit_test/mocks/mock_buffer.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_mdi.h" #include "opencl/test/unit_test/mocks/mock_program.h" diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp index 3b36aa347a..dbc921929b 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp @@ -557,11 +557,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenTimestamp MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector({kernel1.mockKernel, kernel2.mockKernel})); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(cmdQ, CsrDependencies(), 
false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); size_t sizeWithDisabled = cmdQ.requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); size_t sizeWithEnabled = cmdQ.requestedCmdStreamSize; size_t additionalSize = 0u; @@ -669,7 +669,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenAutoLocal EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false, - false, *cmdQ.get(), multiDispatchInfo, false, false); + false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr); expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END); expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize); EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS); @@ -738,7 +738,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false, - false, *cmdQ.get(), multiDispatchInfo, false, false); + false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr); expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END); expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize); EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS); diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp 
b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp index edd2fb1601..f06bff02e4 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -24,6 +24,7 @@ #include "opencl/source/mem_obj/buffer.h" #include "opencl/test/unit_test/fixtures/dispatch_flags_fixture.h" #include "opencl/test/unit_test/fixtures/enqueue_handler_fixture.h" +#include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" @@ -234,7 +235,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_MARKER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest, - eventBuilder, std::unique_ptr(nullptr), nullptr); + eventBuilder, std::unique_ptr(nullptr), nullptr, nullptr); EXPECT_FALSE(blockedCommandsDataForDependencyFlush->blitEnqueue); } @@ -267,7 +268,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_READ_BUFFER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest, - eventBuilder, std::unique_ptr(nullptr), mockCmdQ->getBcsForAuxTranslation()); + eventBuilder, std::unique_ptr(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr); EXPECT_TRUE(blockedCommandsDataForBlitEnqueue->blitEnqueue); EXPECT_EQ(blitProperties.srcAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->srcAllocation); EXPECT_EQ(blitProperties.dstAllocation, 
blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->dstAllocation); @@ -351,7 +352,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitEnqueueWhenDispatchingCommandsWithoutK timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false); + eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); @@ -390,7 +391,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitOperationWhenEnqueueCommandWithoutKern CsrDependencies csrDeps; BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false); + eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); @@ -432,7 +433,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenN1EnabledWhenDispatchingWithoutKernelThenA mockCmdQ->obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, true, bcsCsr); timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false); + eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); @@ -478,7 +479,7 @@ HWTEST_F(DispatchFlagsTests, 
givenMockKernelWhenSettingAdditionalKernelExecInfoT std::vector v; pKernel->setAdditionalKernelExecInfo(123u); - std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1)); + std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr)); cmd->submit(1u, false); EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, 123u); @@ -541,4 +542,41 @@ HWTEST_F(EnqueueHandlerTest, givenTimestampPacketWriteDisabledAndCommandWithCach EXPECT_EQ(nullptr, container); clReleaseEvent(event); } -} // namespace NEO + +template +class MockCommandQueueWithProcessSignal : public MockCommandQueueHw { + using MockCommandQueueHw::MockCommandQueueHw; + + public: + void processSignalMultiRootDeviceNode(LinearStream *commandStream, + TagNodeBase *node) override { + processSignalMultiRootDeviceNodeCalled++; + } + uint32_t processSignalMultiRootDeviceNodeCalled = 0; +}; + +using EnqueueHandlerMultiRootSync = MultiRootDeviceFixture; + +HWTEST_F(EnqueueHandlerMultiRootSync, givenOutEventInMultiRootContextWhenEnqueuehandlerForMapOperationCalledThenMultiRootTagIsNotSignaled) { + auto mockCmdQ = std::make_unique>(context.get(), device1, nullptr); + auto event = std::make_unique>(context.get(), nullptr, 0, 0, 0); + cl_event clEvent = event.get(); + + MultiDispatchInfo multiDispatch; + mockCmdQ->template enqueueHandler(nullptr, 0, false, multiDispatch, 0, nullptr, &clEvent); + EXPECT_EQ(mockCmdQ->processSignalMultiRootDeviceNodeCalled, 0u); + clReleaseEvent(clEvent); +} + +HWTEST_F(EnqueueHandlerMultiRootSync, givenOutEventInMultiRootContextWhenEnqueuehandlerForMarkerOperationCalledThenMultiRootTagIsSignaled) { + auto mockCmdQ = std::make_unique>(context.get(), device1, nullptr); + auto event = std::make_unique>(context.get(), nullptr, 0, 0, 0); + cl_event clEvent 
= event.get(); + + MultiDispatchInfo multiDispatch; + mockCmdQ->template enqueueHandler(nullptr, 0, false, multiDispatch, 0, nullptr, &clEvent); + EXPECT_EQ(mockCmdQ->processSignalMultiRootDeviceNodeCalled, 1u); + clReleaseEvent(clEvent); +} + +} // namespace NEO \ No newline at end of file diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp index f51f97a02b..f3d47baf7a 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp @@ -54,7 +54,7 @@ HWTEST2_F(DispatchFlagsTests, whenSubmittingKernelWithAdditionalKernelExecInfoTh std::vector v; pKernel->setAdditionalKernelExecInfo(AdditionalKernelExecInfo::DisableOverdispatch); - std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1)); + std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr)); cmd->submit(1u, false); EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, AdditionalKernelExecInfo::DisableOverdispatch); diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp index 2565649421..390343b2be 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp @@ -2005,10 +2005,10 @@ HWTEST_F(PauseOnGpuTests, givenGpuScratchWriteEnabledWhenEstimatingCommandStream dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = 
EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); DebugManager.flags.GpuScratchRegWriteAfterWalker.set(1); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); EXPECT_EQ(baseCommandStreamSize + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM), extendedCommandStreamSize); } diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp index 49871b6595..39cf18a236 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp @@ -1014,8 +1014,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithoutW dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr); 
EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size + MemorySynchronizationCommands::getSizeForSingleBarrier(false), extendedCommandStreamSize); } @@ -1033,8 +1033,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueKernelTest, givenTimestampWriteEnableOnMulti dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr); EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size + ImplicitScalingDispatch::getBarrierSize(csr.peekHwInfo(), false, false), extendedCommandStreamSize); } @@ -1047,8 +1047,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithWait dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true, 
nullptr); EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size, extendedCommandStreamSize); } diff --git a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp index 3587e5e578..f0d5895991 100644 --- a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp @@ -23,6 +23,7 @@ #include "opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h" #include "opencl/test/unit_test/fixtures/image_fixture.h" #include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h" +#include "opencl/test/unit_test/mocks/mock_event.h" using namespace NEO; @@ -96,7 +97,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenFillingBufferThenHeapsAndCommandBufferCo auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_FILL_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -149,7 +150,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenCopyingBufferThenHeapsAndCommandBufferCo auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = 
HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -203,7 +204,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferNonBlockingThenHeapsAndComm auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -258,7 +259,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferBlockingThenThenHeapsAndCom auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -313,7 +314,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferNonBlockingThenHeapsAndComm auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -365,7 
+366,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -380,6 +381,68 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand EXPECT_GE(expectedSizeSSH, usedAfterSSH - usedBeforeSSH); } +HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForSingleDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) { + UltClDeviceFactory deviceFactory{1, 0}; + DebugManager.flags.EnableMultiRootDeviceContexts.set(true); + + cl_device_id devices[] = {deviceFactory.rootDevices[0]}; + + MockContext pContext(ClDeviceVector(devices, 1)); + MockKernelWithInternals mockKernel(*pContext.getDevices()[0]); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + auto event = std::make_unique>(&pContext, nullptr, 0, 0, 0); + cl_event clEvent = event.get(); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent); + + EXPECT_EQ(baseCommandStreamSize, extendedCommandStreamSize); +} + +HWTEST_F(GetSizeRequiredBufferTest, 
GivenUserEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) { + UltClDeviceFactory deviceFactory{2, 0}; + DebugManager.flags.EnableMultiRootDeviceContexts.set(true); + + cl_device_id devices[] = {deviceFactory.rootDevices[0], + deviceFactory.rootDevices[1]}; + + MockContext pContext(ClDeviceVector(devices, 2)); + MockKernelWithInternals mockKernel(*pContext.getDevices()[0]); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + auto userEvent1 = std::make_unique(&pContext); + cl_event clEvent = userEvent1.get(); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent); + + EXPECT_EQ(baseCommandStreamSize, extendedCommandStreamSize); +} + +HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsAdded) { + UltClDeviceFactory deviceFactory{2, 0}; + DebugManager.flags.EnableMultiRootDeviceContexts.set(true); + + cl_device_id devices[] = {deviceFactory.rootDevices[0], + deviceFactory.rootDevices[1]}; + + MockContext pContext(ClDeviceVector(devices, 2)); + MockKernelWithInternals mockKernel(*pContext.getDevices()[0]); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + auto event = std::make_unique>(&pContext, nullptr, 0, 0, 0); + cl_event clEvent = event.get(); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize 
= EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent); + + EXPECT_EQ(baseCommandStreamSize + MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(pContext.getDevices()[0]->getHardwareInfo(), false), extendedCommandStreamSize); +} + HWTEST_F(GetSizeRequiredBufferTest, givenMultipleKernelRequiringSshWhenTotalSizeIsComputedThenItIsProperlyAligned) { auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer, pCmdQ->getClDevice()); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp index 1a3c2e8517..10a032bbf4 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp @@ -1904,7 +1904,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenItIsUnblocke blockedCommandsData->setHeaps(dsh, ioh, ssh); std::vector surfaces; - event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1)); + event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr)); event->submitCommand(false); EXPECT_EQ(numGrfRequired, csr->savedDispatchFlags.numGrfRequired); @@ -1949,7 +1949,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenInitializeBc auto blockedCommandsData = std::make_unique(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage()); std::vector surfaces; - event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1)); + 
event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr)); event->submitCommand(false); EXPECT_FALSE(pCmdQ->isCsrLocked); } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp index e04eac63b8..b9d282d5e1 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp @@ -8,6 +8,7 @@ #include "shared/source/command_stream/wait_status.h" #include "shared/source/helpers/timestamp_packet.h" #include "shared/test/common/mocks/mock_command_stream_receiver.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/hw_test.h" @@ -15,6 +16,7 @@ #include "opencl/source/event/user_event.h" #include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h" #include "opencl/test/unit_test/fixtures/ult_command_stream_receiver_fixture.h" +#include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_program.h" #include "opencl/test/unit_test/test_macros/test_checks_ocl.h" @@ -46,12 +48,18 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu MockGraphicsAllocation svmAlloc(svmPtr, svmSize); Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + auto node1 = event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + auto node3 = event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + auto node4 = event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ2, 
CL_COMMAND_NDRANGE_KERNEL, 2, 7); + auto node5 = event5.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); + userEvent1.getMultiRootTimestampSyncNode(); UserEvent userEvent2(&pCmdQ2->getContext()); + userEvent2.getMultiRootTimestampSyncNode(); userEvent1.setStatus(CL_COMPLETE); userEvent2.setStatus(CL_COMPLETE); @@ -88,12 +96,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu EXPECT_EQ(2u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } { @@ -116,12 +124,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu EXPECT_EQ(2u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); - 
EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } alignedFree(svmPtr); } @@ -148,17 +156,24 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo cl_device_id devices[] = {device1, device2, device3}; auto context = std::make_unique(ClDeviceVector(devices, 3), false); - + auto mockTagAllocator = std::make_unique>(context->getRootDeviceIndices(), device1->getExecutionEnvironment()->memoryManager.get(), 10u); + std::unique_ptr uniquePtr(mockTagAllocator.release()); + context->setMultiRootDeviceTimestampPacketAllocator(uniquePtr); auto pCmdQ1 = context->getSpecialQueue(1u); auto pCmdQ2 = context->getSpecialQueue(2u); auto pCmdQ3 = context->getSpecialQueue(3u); Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + auto node1 = event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + auto node3 = event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + auto node4 = event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ3, CL_COMMAND_NDRANGE_KERNEL, 7, 21); + auto node5 = event5.getMultiRootTimestampSyncNode(); Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + auto node6 = event6.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -191,16 +206,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), 
semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(21u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); auto semaphoreCmd2 = genCmdCast(*(semaphores[2])); - EXPECT_EQ(7u, semaphoreCmd2->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node6->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress()); } { @@ -216,16 +231,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); auto 
semaphoreCmd2 = genCmdCast(*(semaphores[2])); - EXPECT_EQ(21u, semaphoreCmd2->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress()); } { @@ -250,8 +265,8 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); } } @@ -287,11 +302,16 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6); + event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + event5.getMultiRootTimestampSyncNode(); Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + event6.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -317,10 +337,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); CsrDependencies csrDeps; - 
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver()); + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver()); - EXPECT_EQ(0u, csrDeps.taskCountContainer.size()); - EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps)); + EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size()); + EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps)); } { @@ -343,10 +363,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); CsrDependencies csrDeps; - eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); - EXPECT_EQ(3u, csrDeps.taskCountContainer.size()); - EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps)); + EXPECT_EQ(3u, csrDeps.multiRootTimeStampSyncContainer.size()); + EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps)); } } @@ -377,6 +397,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW cl_event outputEvent2{}; + auto currentCsUsedCmdq1 = pCmdQ1->getCS(0).getUsed(); pCmdQ2->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr, 1, &outputEvent1, @@ -400,14 +421,12 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW nullptr); { HardwareParse csHwParser; - csHwParser.parseCommands(pCmdQ1->getCS(0)); + csHwParser.parseCommands(pCmdQ1->getCS(0), currentCsUsedCmdq1); auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); EXPECT_EQ(0u, semaphores.size()); } 
userEvent1.setStatus(CL_COMPLETE); - event1->release(); - event2->release(); pCmdQ1->finish(); pCmdQ2->finish(); { @@ -418,7 +437,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd = genCmdCast(*(semaphores[0])); EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress()); + EXPECT_EQ(reinterpret_cast(event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress()); } { HardwareParse csHwParser; @@ -427,9 +446,11 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd = genCmdCast(*(semaphores[0])); - EXPECT_EQ(0u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress()); } + event1->release(); + event2->release(); buffer->release(); } @@ -459,14 +480,14 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW char hostPtr[MemoryConstants::pageSize]{}; cl_event outputEvent2{}; - + auto currentCsUsed = pCmdQ1->getCS(0).getUsed(); pCmdQ1->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr, 1, &outputEvent1, &outputEvent2); { HardwareParse csHwParser; - csHwParser.parseCommands(pCmdQ1->getCS(0)); + csHwParser.parseCommands(pCmdQ1->getCS(0), currentCsUsed); auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); EXPECT_EQ(0u, semaphores.size()); @@ -483,7 +504,7 @@ 
HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW nullptr); { HardwareParse csHwParser; - csHwParser.parseCommands(pCmdQ1->getCS(0)); + csHwParser.parseCommands(pCmdQ1->getCS(0), currentCsUsed); auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); EXPECT_EQ(0u, semaphores.size()); @@ -591,9 +612,6 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(0u, semaphores.size()); } userEvent1.setStatus(CL_COMPLETE); - event1->release(); - event2->release(); - event3->release(); pCmdQ1->finish(); pCmdQ2->finish(); @@ -605,7 +623,8 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd = genCmdCast(*(semaphores[0])); EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress()); + auto node = event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0); + EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd->getSemaphoreGraphicsAddress()); } { HardwareParse csHwParser; @@ -621,8 +640,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(2u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(0u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + auto node = event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0); + EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd0->getSemaphoreGraphicsAddress()); } { HardwareParse csHwParser; @@ -631,6 +651,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_LE(1u, 
semaphores.size()); } + event1->release(); + event2->release(); + event3->release(); buffer->release(); pCmdQ1->release(); pCmdQ2->release(); @@ -962,3 +985,73 @@ HWTEST_F(BcsCrossDeviceMigrationTests, givenBufferWithMultiStorageWhenEnqueueRea EXPECT_EQ(buffer.get(), cmdQueue->migrateMultiGraphicsAllocationsReceivedOperationParams.srcMemObj); } + +HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) { + Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); + Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6); + Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + UserEvent userEvent1(&pCmdQ1->getContext()); + UserEvent userEvent2(&pCmdQ2->getContext()); + + userEvent1.setStatus(CL_COMPLETE); + userEvent2.setStatus(CL_COMPLETE); + { + cl_event eventWaitList[] = + { + &event1, + &event2, + &event3, + &event4, + &event5, + &event6, + &userEvent1, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + + EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); + CsrDependencies csrDeps; + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); + + EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size()); + } +} +HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeContainersThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) { + + MockEvent event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + event1.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); + MockEvent event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + 
event3.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + event4.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + event5.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + event6.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + UserEvent userEvent1(&pCmdQ1->getContext()); + + userEvent1.setStatus(CL_COMPLETE); + + { + cl_event eventWaitList[] = + { + &event1, + &event2, + &event3, + &event4, + &event5, + &event6, + &userEvent1, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + + EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); + CsrDependencies csrDeps; + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); + + EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size()); + } +} diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp index 4018e2ac6e..99dcefb0be 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp @@ -12,6 +12,7 @@ #include "shared/source/helpers/blit_commands_helper.h" #include "shared/source/helpers/constants.h" #include "shared/source/helpers/logical_state_helper.h" +#include "shared/source/os_interface/device_factory.h" #include "shared/source/os_interface/hw_info_config.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp 
b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp index b0bb0304ce..42758e1498 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp @@ -1793,4 +1793,4 @@ HWTEST_F(BcsTests, givenHostPtrToImageWhenBlitBufferIsCalledThenBlitCmdIsFound) hwParser.parseCommands(csr.commandStream, 0); auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); EXPECT_NE(hwParser.cmdList.end(), cmdIterator); -} +} \ No newline at end of file diff --git a/opencl/test/unit_test/event/event_builder_tests.cpp b/opencl/test/unit_test/event/event_builder_tests.cpp index 234248a617..ff0cda16de 100644 --- a/opencl/test/unit_test/event/event_builder_tests.cpp +++ b/opencl/test/unit_test/event/event_builder_tests.cpp @@ -79,7 +79,7 @@ TEST(EventBuilder, givenVirtualEventWithCommandThenFinalizeAddChild) { public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); @@ -129,7 +129,7 @@ TEST(EventBuilder, givenVirtualEventWithSubmittedCommandAsParentThenFinalizeNotA public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, 
false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp index 4d58140ed5..9ef77c60a3 100644 --- a/opencl/test/unit_test/event/event_tests.cpp +++ b/opencl/test/unit_test/event/event_tests.cpp @@ -486,7 +486,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandFalseWhenSubmittingCommandsThenRefA PreemptionMode preemptionMode = pDevice->getPreemptionMode(); v.push_back(bufferSurf); - auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); auto taskLevelBefore = csr.peekTaskLevel(); @@ -529,7 +529,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandTrueWhenSubmittingCommandsThenRefAp NullSurface *surface = new NullSurface; v.push_back(surface); PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); auto taskLevelBefore = csr.peekTaskLevel(); @@ -580,7 +580,7 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut std::vector v; PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, 
nullptr); event.setCommand(std::unique_ptr(cmd)); event.submitCommand(false); @@ -632,7 +632,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnCmdQueueWaitFunctionAndBlockedKernelWit std::vector v; PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); event.submitCommand(false); @@ -681,7 +681,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnPrintingEnqueueOutputAndBlockedKernelWi std::vector v; PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); event.submitCommand(false); @@ -1170,7 +1170,7 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCsrOccurs) { public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; class MockEvent : public Event { public: @@ -1751,7 +1751,7 @@ HWTEST_F(InternalsEventTest, givenAbortedCommandWhenSubmitCalledThenDontUpdateFl blockedCommandsData->setHeaps(dsh, ioh, ssh); PreemptionMode preemptionMode = 
pDevice->getPreemptionMode(); std::vector v; - auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr); event->setCommand(std::unique_ptr(cmd)); FlushStamp expectedFlushStamp = 0; @@ -1894,3 +1894,35 @@ TEST(EventTimestampTest, givenEnableTimestampWaitWhenCheckIsTimestampWaitEnabled EXPECT_TRUE(event.isWaitForTimestampsEnabled()); } } +TEST(MultiRootEvent, givenContextWithMultiRootTagAllocatorWhenEventGetsTagThenNewAllocatorIsNotCreated) { + auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); + MockContext context{}; + MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false); + auto allocator = cmdQ.getGpgpuCommandStreamReceiver().createMultiRootDeviceTimestampPacketAllocator(context.getRootDeviceIndices()); + auto allocatorPtr = allocator.get(); + context.setMultiRootDeviceTimestampPacketAllocator(allocator); + MockEvent event{&cmdQ, CL_COMMAND_MARKER, 0, 0}; + event.getMultiRootTimestampSyncNode(); + EXPECT_EQ(allocatorPtr, context.getMultiRootDeviceTimestampPacketAllocator()); +} +TEST(MultiRootEvent, givenContextWithoutMultiRootTagAllocatorWhenEventGetsTagThenNewAllocatorIsCreated) { + auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); + MockContext context{}; + MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false); + EXPECT_EQ(context.getMultiRootDeviceTimestampPacketAllocator(), nullptr); + MockEvent event{&cmdQ, CL_COMMAND_MARKER, 0, 0}; + event.getMultiRootTimestampSyncNode(); + EXPECT_NE(context.getMultiRootDeviceTimestampPacketAllocator(), nullptr); +} +TEST(MultiRootEvent, givenEventWithTagWhenEventGetsNewTagThenNewTagContainerIsNotCreated) { + auto mockDevice = 
std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); + MockContext context{}; + MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false); + MockEvent event{&cmdQ, CL_COMMAND_MARKER, 0, 0}; + EXPECT_EQ(event.getMultiRootDeviceTimestampPacketNodes(), nullptr); + event.getMultiRootTimestampSyncNode(); + auto containerPtr = event.getMultiRootDeviceTimestampPacketNodes(); + EXPECT_NE(containerPtr, nullptr); + event.getMultiRootTimestampSyncNode(); + EXPECT_EQ(containerPtr, event.getMultiRootDeviceTimestampPacketNodes()); +} \ No newline at end of file diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp index 98e8f404d4..df7c3da23a 100644 --- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp +++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp @@ -2435,7 +2435,7 @@ HWTEST_F(GTPinTests, givenGtPinInitializedWhenSubmittingKernelCommandThenFlushed gtpinNotifyKernelSubmit(kernel.mockMultiDeviceKernel, mockCmdQ.get()); - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr)); CompletionStamp stamp = command->submit(20, false); ASSERT_EQ(1u, kernelExecQueue.size()); diff --git a/opencl/test/unit_test/helpers/task_information_tests.cpp b/opencl/test/unit_test/helpers/task_information_tests.cpp index b4489548af..302f090012 100644 --- a/opencl/test/unit_test/helpers/task_information_tests.cpp +++ b/opencl/test/unit_test/helpers/task_information_tests.cpp @@ -154,7 +154,7 @@ TEST(CommandTest, givenWaitlistRequestWhenCommandComputeKernelIsCreatedThenMakeL public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : 
CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); @@ -291,7 +291,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD for (auto &surface : surfaces) { requiresCoherency |= surface->IsCoherent; } - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr)); command->submit(20, false); EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode); @@ -339,7 +339,7 @@ HWTEST_F(DispatchFlagsTests, givenClCommandCopyImageWhenSubmitThenFlushTextureCa for (auto &surface : surfaces) { requiresCoherency |= surface->IsCoherent; } - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1, nullptr)); command->submit(20, false); EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode); @@ -425,7 +425,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD bool flushDC = false; bool slmUsed = false; bool ndRangeKernel = false; - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new 
CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr)); command->submit(20, false); EXPECT_TRUE(mockCsr->passedDispatchFlags.epilogueRequired); diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index 1abdebb924..5d1fdd51b3 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -34,11 +34,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl auto mockCmdQHw = std::make_unique>(context, device.get(), nullptr); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; auto extendedSize = sizeWithDisabled + sizeof(typename FamilyType::PIPE_CONTROL); @@ -52,7 +52,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, - false, multiDispatchInfo, nullptr, 0, false, false); + false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; 
device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; @@ -82,7 +82,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat eventsRequest.fillCsrDependenciesForTimestampPacketContainer( csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; @@ -143,7 +143,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr auto mockCmdQHw = std::make_unique>(context, device.get(), nullptr); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; @@ -172,7 +172,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr CsrDependencies csrDeps; eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; diff --git a/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp 
b/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp index c983c1876b..230f9c65d9 100644 --- a/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp @@ -213,7 +213,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd { EXPECT_FALSE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ)); - initialSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false); + initialSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr); } { @@ -227,7 +227,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd ultCsr.multiOsContextCapable = false; EXPECT_TRUE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ)); - sizeWithCacheFlush = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false); + sizeWithCacheFlush = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr); } EXPECT_EQ(initialSize + expectedDiff, sizeWithCacheFlush); diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index 4bf2b03166..3cddb38da9 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -505,6 +505,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver { using BaseClass::CommandStreamReceiver; TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; } + std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override { return std::unique_ptr(nullptr); } 
SubmissionStatus flushTagUpdate() override { return SubmissionStatus::SUCCESS; }; void updateTagFromWait() override{}; diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index b01b706520..c4b1a4c128 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -767,11 +767,11 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferOperationWithoutKernelWhenEstimati auto &hwInfo = cmdQ->getDevice().getHardwareInfo(); auto readBufferCmdsSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, csrDependencies, false, false, - true, *cmdQ, multiDispatchInfo, false, false); + true, *cmdQ, multiDispatchInfo, false, false, nullptr); auto writeBufferCmdsSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, csrDependencies, false, false, - true, *cmdQ, multiDispatchInfo, false, false); + true, *cmdQ, multiDispatchInfo, false, false, nullptr); auto copyBufferCmdsSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, csrDependencies, false, false, - true, *cmdQ, multiDispatchInfo, false, false); + true, *cmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); if (cmdQ->isCacheFlushForBcsRequired()) { diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 553b0f9397..6a501e2546 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -249,6 +249,7 @@ class MockCommandQueueHw : public CommandQueueHw { using BaseClass::latestSentEnqueueType; using BaseClass::obtainCommandStream; using BaseClass::obtainNewTimestampPacketNodes; + using BaseClass::processDispatchForKernels; using BaseClass::requiresCacheFlushAfterWalker; using BaseClass::throttle; using BaseClass::timestampPacketContainer; diff 
--git a/opencl/test/unit_test/mocks/mock_event.h b/opencl/test/unit_test/mocks/mock_event.h index bde34c8904..6636dd9362 100644 --- a/opencl/test/unit_test/mocks/mock_event.h +++ b/opencl/test/unit_test/mocks/mock_event.h @@ -39,6 +39,7 @@ struct MockEvent : public BaseEventType { using Event::calculateSubmitTimestampData; using Event::isWaitForTimestampsEnabled; using Event::magic; + using Event::multiRootDeviceTimestampPacketContainer; using Event::queueTimeStamp; using Event::submitTimeStamp; using Event::timestampPacketContainer; diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp index 92beea833f..7dec14a546 100644 --- a/opencl/test/unit_test/profiling/profiling_tests.cpp +++ b/opencl/test/unit_test/profiling/profiling_tests.cpp @@ -71,13 +71,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor MultiDispatchInfo multiDispatchInfo(&kernel); auto &commandStreamNDRangeKernel = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, &kernel, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamNDRangeKernel.getAvailableSpace(), requiredSize); auto &commandStreamTask = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, &kernel, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize); @@ -93,13 +93,13 @@ HWTEST_F(ProfilingTests, GivenCommandQueueWithProfilingAndForWorkloadWithNoKerne MultiDispatchInfo multiDispatchInfo(nullptr); auto &commandStreamMigrateMemObjects = 
getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, false, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize); auto &commandStreamMarker = getCommandStream(*pCmdQ, CsrDependencies(), true, - false, false, multiDispatchInfo, nullptr, 0, false, false); + false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MARKER, true, false, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize); @@ -121,9 +121,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor multiDispatchInfo.push(dispatchInfo); multiDispatchInfo.push(dispatchInfo); auto &commandStreamTask = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_TASK, CsrDependencies(), true, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize); } @@ -741,13 +741,13 @@ HWTEST_F(ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCount MultiDispatchInfo multiDispatchInfo(nullptr); auto &commandStreamMigrateMemObjects = getCommandStream(*pCmdQ, CsrDependencies(), true, true, false, multiDispatchInfo, - nullptr, 0, false, false); + nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, true, *pCmdQ, nullptr, {}); 
EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize); auto &commandStreamMarker = getCommandStream(*pCmdQ, CsrDependencies(), true, true, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MARKER, true, true, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize); diff --git a/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp b/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp index 4d23062f48..ddef4dc98d 100644 --- a/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp +++ b/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp @@ -13,6 +13,7 @@ #include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/preamble.h" #include "shared/source/os_interface/device_factory.h" +#include "shared/source/utilities/tag_allocator.h" #include "shared/source/xe_hpc_core/hw_cmds_pvc.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/variable_backup.h" @@ -243,10 +244,14 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent MockGraphicsAllocation svmAlloc(svmPtr, svmSize); Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + auto node1 = event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + auto node3 = event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + auto node4 = event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + auto node5 = event5.getMultiRootTimestampSyncNode(); UserEvent 
userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -285,12 +290,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } { @@ -313,12 +318,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } 
alignedFree(svmPtr); } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index f60549b24d..1753fa6323 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -220,6 +220,7 @@ class CommandStreamReceiver { TagAllocatorBase *getEventTsAllocator(); TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize); virtual TagAllocatorBase *getTimestampPacketAllocator() = 0; + virtual std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) = 0; virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation); diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index 7aab4233e8..ff16a0835d 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -130,6 +130,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { GraphicsAllocation *getClearColorAllocation() override; TagAllocatorBase *getTimestampPacketAllocator() override; + std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override; void postInitFlagsSetup() override; void programActivePartitionConfig(LinearStream &csr); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 6b0a19e926..a1fea16353 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -409,7 +409,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto commandStreamStartCSR = commandStreamCSR.getUsed(); 
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStreamCSR, dispatchFlags.csrDependencies); - TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStreamCSR, dispatchFlags.csrDependencies); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStreamCSR, dispatchFlags.csrDependencies); programActivePartitionConfigFlushTask(commandStreamCSR); programEngineModeCommands(commandStreamCSR, dispatchFlags); @@ -982,7 +982,7 @@ size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const Dispat } size += TimestampPacketHelper::getRequiredCmdStreamSize(dispatchFlags.csrDependencies); - size += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(dispatchFlags.csrDependencies); + size += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(dispatchFlags.csrDependencies); size += EncodeKernelArgsBuffer::getKernelArgsBufferCmdsSize(kernelArgsBufferAllocation, logicalStateHelper.get()); @@ -1198,7 +1198,7 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert for (auto &blitProperties : blitPropertiesContainer) { TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, blitProperties.csrDependencies); - TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStream, blitProperties.csrDependencies); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStream, blitProperties.csrDependencies); BlitCommandsHelper::encodeWa(commandStream, blitProperties, latestSentBcsWaValue); @@ -1231,6 +1231,12 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert if (blitProperties.clearColorAllocation) { makeResident(*blitProperties.clearColorAllocation); } + if (blitProperties.multiRootDeviceEventSync != nullptr) { + MiFlushArgs args; + args.commandWithPostSync = true; + args.notifyEnable = isUsedNotifyEnableForPostSync(); + 
EncodeMiFlushDW::programMiFlushDw(commandStream, blitProperties.multiRootDeviceEventSync->getGpuAddress() + blitProperties.multiRootDeviceEventSync->getContextEndOffset(), std::numeric_limits::max(), args, hwInfo); + } } BlitCommandsHelper::programGlobalSequencerFlush(commandStream); @@ -1247,7 +1253,6 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), false, peekHwInfo()); } - if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnBlitCopy.get(), taskCount, PauseOnGpuProperties::PauseMode::AfterWorkload)) { BlitCommandsHelper::dispatchDebugPauseCommands(commandStream, getDebugPauseStateGPUAddress(), DebugPauseState::waitingForUserEndConfirmation, @@ -1524,6 +1529,11 @@ TagAllocatorBase *CommandStreamReceiverHw::getTimestampPacketAllocato return timestampPacketAllocator.get(); } +template +std::unique_ptr CommandStreamReceiverHw::createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) { + auto &gfxCoreHelper = getGfxCoreHelper(); + return gfxCoreHelper.createTimestampPacketAllocator(rootDeviceIndices, getMemoryManager(), getPreferredTagPoolSize(), getType(), osContext->getDeviceBitfield()); +} template void CommandStreamReceiverHw::postInitFlagsSetup() { useNewResourceImplicitFlush = checkPlatformSupportsNewResourceImplicitFlush(); diff --git a/shared/source/command_stream/csr_deps.cpp b/shared/source/command_stream/csr_deps.cpp index ba0429cec6..0ae2ab90fb 100644 --- a/shared/source/command_stream/csr_deps.cpp +++ b/shared/source/command_stream/csr_deps.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -22,4 +22,10 @@ void CsrDependencies::copyNodesToNewContainer(TimestampPacketContainer &newTimes 
newTimestampPacketContainer.assignAndIncrementNodesRefCounts(*timestampPacketContainer); } } +void CsrDependencies::copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer) { + for (auto ×tampPacketContainer : multiRootTimeStampSyncContainer) { + newTimestampPacketContainer.assignAndIncrementNodesRefCounts(*timestampPacketContainer); + } +} + } // namespace NEO diff --git a/shared/source/command_stream/csr_deps.h b/shared/source/command_stream/csr_deps.h index eab7f8d115..a0cf5bcc68 100644 --- a/shared/source/command_stream/csr_deps.h +++ b/shared/source/command_stream/csr_deps.h @@ -22,10 +22,11 @@ class CsrDependencies { All }; - StackVec, 32> taskCountContainer; + StackVec multiRootTimeStampSyncContainer; StackVec timestampPacketContainer; void makeResident(CommandStreamReceiver &commandStreamReceiver) const; void copyNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer); + void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer); }; } // namespace NEO diff --git a/shared/source/helpers/blit_commands_helper_base.inl b/shared/source/helpers/blit_commands_helper_base.inl index 9fc8a7b68a..6e5177e738 100644 --- a/shared/source/helpers/blit_commands_helper_base.inl +++ b/shared/source/helpers/blit_commands_helper_base.inl @@ -127,7 +127,7 @@ size_t BlitCommandsHelper::estimateBlitCommandSize(const Vec3 sizePerBlit += estimatePostBlitCommandSize(); return TimestampPacketHelper::getRequiredCmdStreamSize(csrDependencies) + - TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDependencies) + + TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDependencies) + (sizePerBlit * nBlits) + timestampCmdSize + estimatePreBlitCommandSize(); @@ -143,6 +143,9 @@ size_t BlitCommandsHelper::estimateBlitCommandsSize(const BlitPropert auto isImage = blitProperties.isImageOperation(); size += 
BlitCommandsHelper::estimateBlitCommandSize(blitProperties.copySize, blitProperties.csrDependencies, updateTimestampPacket, profilingEnabled, isImage, rootDeviceEnvironment, blitProperties.isSystemMemoryPoolUsed); + if (blitProperties.multiRootDeviceEventSync != nullptr) { + size += EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite(); + } } size += BlitCommandsHelper::getWaCmdsSize(blitPropertiesContainer); size += 2 * MemorySynchronizationCommands::getSizeForAdditonalSynchronization(*rootDeviceEnvironment.getHardwareInfo()); diff --git a/shared/source/helpers/blit_properties.cpp b/shared/source/helpers/blit_properties.cpp index c5ca44600c..5120d8e6da 100644 --- a/shared/source/helpers/blit_properties.cpp +++ b/shared/source/helpers/blit_properties.cpp @@ -43,6 +43,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants: BlitterConstants::BlitDirection::HostPtrToImage == blitDirection) { return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync blitDirection, // blitDirection {}, // csrDependencies AuxTranslationDirection::None, // auxTranslationDirection @@ -66,6 +67,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants: } else { return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync blitDirection, // blitDirection {}, // csrDependencies AuxTranslationDirection::None, // auxTranslationDirection @@ -97,6 +99,7 @@ BlitProperties BlitProperties::constructPropertiesForCopy(GraphicsAllocation *ds return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection {}, // csrDependencies AuxTranslationDirection::None, // auxTranslationDirection @@ -121,6 +124,7 @@ BlitProperties BlitProperties::constructPropertiesForAuxTranslation(AuxTranslati auto allocationSize = allocation->getUnderlyingBufferSize(); return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync 
BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection {}, // csrDependencies auxTranslationDirection, // auxTranslationDirection diff --git a/shared/source/helpers/blit_properties.h b/shared/source/helpers/blit_properties.h index 20215fbc5d..8a24756f66 100644 --- a/shared/source/helpers/blit_properties.h +++ b/shared/source/helpers/blit_properties.h @@ -49,6 +49,7 @@ struct BlitProperties { CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr); TagNodeBase *outputTimestampPacket = nullptr; + TagNodeBase *multiRootDeviceEventSync = nullptr; BlitterConstants::BlitDirection blitDirection; CsrDependencies csrDependencies; AuxTranslationDirection auxTranslationDirection; diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index 5a63df976e..ab33afa42d 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -125,17 +125,11 @@ struct TimestampPacketHelper { } template - static void programCsrDependenciesForForTaskCountContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) { - auto &taskCountContainer = csrDependencies.taskCountContainer; - - for (auto &[taskCountPreviousRootDevice, tagAddressPreviousRootDevice] : taskCountContainer) { - using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; - using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; - - EncodeSempahore::addMiSemaphoreWaitCommand(cmdStream, - static_cast(tagAddressPreviousRootDevice), - static_cast(taskCountPreviousRootDevice), - COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + static void programCsrDependenciesForForMultiRootDeviceSyncContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) { + for (auto timestampPacketContainer : csrDependencies.multiRootTimeStampSyncContainer) { + for (auto &node : timestampPacketContainer->peekNodes()) { + TimestampPacketHelper::programSemaphore(cmdStream, *node); 
+ } } } @@ -199,8 +193,8 @@ struct TimestampPacketHelper { } template - static size_t getRequiredCmdStreamSizeForTaskCountContainer(const CsrDependencies &csrDependencies) { - return csrDependencies.taskCountContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); + static size_t getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(const CsrDependencies &csrDependencies) { /* Returns the command-stream bytes reserved for multi-root sync waits: sizeof(MI_SEMAPHORE_WAIT) times the number of entries in multiRootTimeStampSyncContainer. NOTE(review): programCsrDependenciesForForMultiRootDeviceSyncContainer calls programSemaphore once per node returned by peekNodes() of each entry, so this estimate matches the programmed size only if every entry holds exactly one node that emits one semaphore (an empty entry is over-counted, a multi-node entry under-counted) - confirm against callers. */ + return csrDependencies.multiRootTimeStampSyncContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); } }; diff --git a/shared/test/common/helpers/memory_management.h b/shared/test/common/helpers/memory_management.h index cc23e87095..55651ad186 100644 --- a/shared/test/common/helpers/memory_management.h +++ b/shared/test/common/helpers/memory_management.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index b83ede247f..4d7b93e4f2 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -19,6 +19,7 @@ #include "shared/source/memory_manager/graphics_allocation.h" #include "shared/source/memory_manager/surface.h" #include "shared/source/os_interface/os_context.h" +#include "shared/source/utilities/tag_allocator.h" #include "shared/test/common/helpers/dispatch_flags_helper.h" #include @@ -99,6 +100,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { }; TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; } + std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override { return std::unique_ptr(nullptr); } CompletionStamp flushTask( LinearStream &commandStream, diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp 
b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index db2db27535..46b8ae553c 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -15,10 +15,13 @@ #include "shared/source/helpers/api_specific_config.h" #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/source/memory_manager/surface.h" +#include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/os_interface/device_factory.h" #include "shared/source/os_interface/hw_info_config.h" #include "shared/source/os_interface/os_interface.h" #include "shared/source/utilities/tag_allocator.h" +#include "shared/test/common/cmd_parse/gen_cmd_parse.h" +#include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/fixtures/command_stream_receiver_fixture.inl" #include "shared/test/common/fixtures/device_fixture.h" #include "shared/test/common/helpers/batch_buffer_helper.h" @@ -33,6 +36,7 @@ #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_internal_allocation_storage.h" #include "shared/test/common/mocks/mock_memory_manager.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/hw_test.h" #include "shared/test/common/test_macros/test_checks_shared.h" @@ -2477,3 +2481,89 @@ HWTEST_F(CommandStreamReceiverHwTest, givenVariousCsrModeWhenGettingTbxModeThenE ultCsr.commandStreamReceiverType = CommandStreamReceiverType::CSR_TBX_WITH_AUB; EXPECT_TRUE(ultCsr.isTbxMode()); } + +HWTEST_F(CommandStreamReceiverHwTest, GivenTwoRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForTwoDevicesCreated) { + auto executionEnvironment = std::make_unique(defaultHwInfo.get(), true, 2u); + auto devices = 
DeviceFactory::createDevices(*executionEnvironment.release()); + const RootDeviceIndicesContainer indices = {0u, 1u}; + auto csr = devices[0]->getDefaultEngine().commandStreamReceiver; + auto allocator = csr->createMultiRootDeviceTimestampPacketAllocator(indices); + class MockTagAllocatorBase : public TagAllocatorBase { + public: + using TagAllocatorBase::maxRootDeviceIndex; + }; + EXPECT_EQ(reinterpret_cast(allocator.get())->maxRootDeviceIndex, 1u); +} +HWTEST_F(CommandStreamReceiverHwTest, GivenFiveRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForFiveDevicesCreated) { /* Builds five root devices (indices 0..4) so the body matches the test name, then checks the allocator created for those indices reports 4 as its highest root device index. */ + auto executionEnvironment = std::make_unique(defaultHwInfo.get(), true, 5u); + auto devices = DeviceFactory::createDevices(*executionEnvironment.release()); + const RootDeviceIndicesContainer indices = {0u, 1u, 2u, 3u, 4u}; + auto csr = devices[0]->getDefaultEngine().commandStreamReceiver; + auto allocator = csr->createMultiRootDeviceTimestampPacketAllocator(indices); + class MockTagAllocatorBase : public TagAllocatorBase { + public: + using TagAllocatorBase::maxRootDeviceIndex; + }; + EXPECT_EQ(reinterpret_cast(allocator.get())->maxRootDeviceIndex, 4u); +} +HWTEST_F(CommandStreamReceiverHwTest, givenMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushAdded) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + auto mockTagAllocator = std::make_unique>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + + auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr, + commandStreamReceiver, commandStreamReceiver.getTagAllocation(), nullptr, + commandStreamReceiver.getTagAllocation()->getUnderlyingBuffer(), + commandStreamReceiver.getTagAllocation()->getGpuAddress(), 0, + 0, 0, 0, 0, 0, 0, 0); + auto tag = mockTagAllocator->getTag(); + blitProperties.multiRootDeviceEventSync = tag; + + 
BlitPropertiesContainer container; + container.push_back(blitProperties); + commandStreamReceiver.flushBcsTask(container, true, false, *pDevice); + HardwareParse hwParser; + hwParser.parseCommands(commandStreamReceiver.commandStream, 0); + + auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + bool nodeAddressFound = false; + while (cmdIterator != hwParser.cmdList.end()) { + auto flush = genCmdCast(*cmdIterator); + if (flush->getDestinationAddress() == tag->getGpuAddress() + tag->getContextEndOffset()) { + nodeAddressFound = true; + break; + } + cmdIterator = find(++cmdIterator, hwParser.cmdList.end()); + } + EXPECT_TRUE(nodeAddressFound); +} +HWTEST_F(CommandStreamReceiverHwTest, givenNullPtrAsMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushNotAdded) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + auto mockTagAllocator = std::make_unique>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + + auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr, + commandStreamReceiver, commandStreamReceiver.getTagAllocation(), nullptr, + commandStreamReceiver.getTagAllocation()->getUnderlyingBuffer(), + commandStreamReceiver.getTagAllocation()->getGpuAddress(), 0, + 0, 0, 0, 0, 0, 0, 0); + auto tag = mockTagAllocator->getTag(); + + BlitPropertiesContainer container; + container.push_back(blitProperties); + commandStreamReceiver.flushBcsTask(container, true, false, *pDevice); + HardwareParse hwParser; + hwParser.parseCommands(commandStreamReceiver.commandStream, 0); + + auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + bool nodeAddressFound = false; + while (cmdIterator != hwParser.cmdList.end()) { + auto flush = genCmdCast(*cmdIterator); + if (flush->getDestinationAddress() == tag->getGpuAddress() + tag->getContextEndOffset()) { + 
nodeAddressFound = true; + break; + } + cmdIterator = find(++cmdIterator, hwParser.cmdList.end()); + } + EXPECT_FALSE(nodeAddressFound); +} \ No newline at end of file diff --git a/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp b/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp index c69b591a65..8bde71a82a 100644 --- a/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp +++ b/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp @@ -16,6 +16,7 @@ #include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/test_checks_shared.h" @@ -664,3 +665,25 @@ HWTEST2_F(BlitTests, givenPlatformWhenCallingDispatchPreBlitCommandThenNoneMiFlu auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); ASSERT_EQ(hwParser.cmdList.end(), cmdIterator); } + +HWTEST_F(BlitTests, givenPlatformWhenCallingDispatchPreBlitCommandThenNoneMiFlushDwIsProgramed) { + auto mockTagAllocator = std::make_unique>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + auto tag = mockTagAllocator->getTag(); + BlitProperties blitProperties{}; + blitProperties.copySize = {1, 1, 1}; + BlitPropertiesContainer blitPropertiesContainer1; + blitPropertiesContainer1.push_back(blitProperties); + blitPropertiesContainer1.push_back(blitProperties); + blitPropertiesContainer1.push_back(blitProperties); + + auto estimatedSizeWithoutNode = BlitCommandsHelper::estimateBlitCommandsSize( + blitPropertiesContainer1, false, true, false, pDevice->getRootDeviceEnvironment()); + blitProperties.multiRootDeviceEventSync = tag; + BlitPropertiesContainer blitPropertiesContainer2; + blitPropertiesContainer2.push_back(blitProperties); + 
blitPropertiesContainer2.push_back(blitProperties); + blitPropertiesContainer2.push_back(blitProperties); + auto estimatedSizeWithNode = BlitCommandsHelper::estimateBlitCommandsSize( + blitPropertiesContainer2, false, true, false, pDevice->getRootDeviceEnvironment()); + EXPECT_NE(estimatedSizeWithoutNode, estimatedSizeWithNode); +} \ No newline at end of file diff --git a/shared/test/unit_test/helpers/timestamp_packet_tests.cpp b/shared/test/unit_test/helpers/timestamp_packet_tests.cpp index 8ea1f939fa..f3f4e14904 100644 --- a/shared/test/unit_test/helpers/timestamp_packet_tests.cpp +++ b/shared/test/unit_test/helpers/timestamp_packet_tests.cpp @@ -303,3 +303,35 @@ HWTEST_F(DeviceTimestampPacketTests, givenDebugFlagSetWhenCreatingTimestampPacke EXPECT_FALSE(tag->canBeReleased()); } + +using TimestampPacketHelperTests = Test; + +HWTEST_F(TimestampPacketHelperTests, givenTagNodesInMultiRootSyncContainerWhenProgramingDependensiecThenSemaforesAreProgrammed) { + StackVec buffer(4096); + LinearStream cmdStream(buffer.begin(), buffer.size()); + CsrDependencies deps; + auto mockTagAllocator = std::make_unique>(0, pDevice->getMemoryManager()); + TimestampPacketContainer container = {}; + container.add(mockTagAllocator->getTag()); + deps.multiRootTimeStampSyncContainer.push_back(&container); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(cmdStream, deps); + EXPECT_EQ(cmdStream.getUsed(), sizeof(typename FamilyType::MI_SEMAPHORE_WAIT)); +} + +HWTEST_F(TimestampPacketHelperTests, givenEmptyContainerMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) { + StackVec buffer(4096); + LinearStream cmdStream(buffer.begin(), buffer.size()); + CsrDependencies deps; + TimestampPacketContainer container = {}; + deps.multiRootTimeStampSyncContainer.push_back(&container); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(cmdStream, deps); + EXPECT_EQ(cmdStream.getUsed(), 0u); +} + 
+HWTEST_F(TimestampPacketHelperTests, givenEmptyMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) { + StackVec buffer(4096); + LinearStream cmdStream(buffer.begin(), buffer.size()); + CsrDependencies deps; + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(cmdStream, deps); + EXPECT_EQ(cmdStream.getUsed(), 0u); +} \ No newline at end of file