From 16bc84e27d2044bb8ed9dacc26cc3c51537a2b10 Mon Sep 17 00:00:00 2001 From: Maciej Plewka Date: Thu, 12 Jan 2023 09:59:50 +0000 Subject: [PATCH] feature(ocl) use tags to synchronize multi root device events Signed-off-by: Maciej Plewka --- .../source/command_queue/command_queue_hw.h | 9 +- .../command_queue_hw_xehp_and_later.inl | 4 +- opencl/source/command_queue/enqueue_common.h | 55 +++++- opencl/source/command_queue/gpgpu_walker.h | 6 +- .../command_queue/gpgpu_walker_base.inl | 14 +- .../source/command_queue/hardware_interface.h | 3 +- .../command_queue/hardware_interface_base.inl | 3 +- opencl/source/context/context.cpp | 14 +- opencl/source/context/context.h | 6 + opencl/source/event/event.cpp | 20 +- opencl/source/event/event.h | 4 + opencl/source/gen11/command_queue_gen11.cpp | 2 +- .../source/gen12lp/command_queue_gen12lp.cpp | 2 +- opencl/source/gen8/command_queue_gen8.cpp | 2 +- opencl/source/gen9/command_queue_gen9.cpp | 2 +- opencl/source/helpers/properties_helper.cpp | 15 +- opencl/source/helpers/properties_helper.h | 2 +- opencl/source/helpers/task_information.cpp | 14 +- opencl/source/helpers/task_information.h | 3 +- .../xe_hp_core/command_queue_xe_hp_core.cpp | 2 +- .../xe_hpc_core/command_queue_xe_hpc_core.cpp | 2 +- .../xe_hpg_core/command_queue_xe_hpg_core.cpp | 2 +- .../command_queue_hw_1_tests.cpp | 4 +- .../command_queue_hw_2_tests.cpp | 3 +- .../command_queue/dispatch_walker_tests.cpp | 1 + .../dispatch_walker_tests_xehp_and_later.cpp | 10 +- .../enqueue_command_without_kernel_tests.cpp | 10 +- ...and_without_kernel_tests_dg2_and_later.cpp | 2 +- .../command_queue/enqueue_kernel_1_tests.cpp | 4 +- .../command_queue/enqueue_kernel_2_tests.cpp | 12 +- .../get_size_required_buffer_tests.cpp | 77 +++++++- ...and_stream_receiver_flush_task_3_tests.cpp | 4 +- ...and_stream_receiver_flush_task_4_tests.cpp | 181 +++++++++++++----- .../command_stream_receiver_hw_1_tests.cpp | 1 + .../command_stream_receiver_hw_2_tests.cpp | 2 +- .../unit_test/event/event_builder_tests.cpp | 6 +- opencl/test/unit_test/event/event_tests.cpp | 46 ++++- opencl/test/unit_test/gtpin/gtpin_tests.cpp | 2 +- .../helpers/task_information_tests.cpp | 10 +- .../helpers/timestamp_packet_1_tests.cpp | 14 +- .../kernel_cache_flush_requirements_tests.cpp | 6 +- opencl/test/unit_test/kernel/kernel_tests.cpp | 1 + .../unit_test/mem_obj/buffer_bcs_tests.cpp | 6 +- .../test/unit_test/mocks/mock_command_queue.h | 1 + opencl/test/unit_test/mocks/mock_event.h | 3 +- .../unit_test/profiling/profiling_tests.cpp | 18 +- .../command_stream_receiver_hw_tests_pvc.cpp | 20 +- .../command_stream/command_stream_receiver.h | 1 + .../command_stream_receiver_hw.h | 1 + .../command_stream_receiver_hw_base.inl | 18 +- shared/source/command_stream/csr_deps.h | 4 +- .../source/helpers/blit_commands_helper.cpp | 6 +- shared/source/helpers/blit_commands_helper.h | 3 +- .../helpers/blit_commands_helper_base.inl | 5 +- shared/source/helpers/timestamp_packet.h | 20 +- .../mocks/mock_command_stream_receiver.h | 2 + .../command_stream_receiver_tests.cpp | 90 +++++++++ .../helpers/blit_commands_helper_tests.cpp | 23 +++ .../helpers/timestamp_packet_tests.cpp | 34 +++- 59 files changed, 644 insertions(+), 193 deletions(-) diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 5679c2f616..8fbadd5f80 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -391,7 +391,8 @@ class CommandQueueHw : public CommandQueue { EventsRequest &eventsRequest, EventBuilder &externalEventBuilder, std::unique_ptr &&printfHandler, - CommandStreamReceiver *bcsCsr); + CommandStreamReceiver *bcsCsr, + TagNodeBase *multiRootDeviceSyncNode); CompletionStamp enqueueCommandWithoutKernel(Surface **surfaces, size_t surfaceCount, @@ -422,7 +423,7 @@ class CommandQueueHw : public CommandQueue { TimestampPacketDependencies ×tampPacketDependencies, const EventsRequest &eventsRequest, LinearStream *commandStream, - uint32_t commandType, bool queueBlocked); + uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync); void submitCacheFlush(Surface **surfaces, size_t numSurfaces, LinearStream *commandStream, @@ -433,6 +434,8 @@ class CommandQueueHw : public CommandQueue { bool waitForTimestamps(Range copyEnginesToWait, TaskCountType taskCount, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override; MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const; + void processSignalMultiRootDeviceNode(LinearStream *commandStream, + TagNodeBase *node); protected: MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){}; @@ -473,7 +476,7 @@ class CommandQueueHw : public CommandQueue { blockedCommandsData = std::make_unique(commandStream, *gpgpuCsr.getInternalAllocationStorage()); } else { commandStream = &getCommandStream(*this, csrDependencies, profilingRequired, perfCountersRequired, - blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0); + blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0, eventsRequest.outEvent); } return commandStream; } diff --git a/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl b/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl index 33b2cadafa..27b00dd488 100644 --- a/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl +++ b/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -45,7 +45,7 @@ bool CommandQueueHw::isCacheFlushCommand(uint32_t commandType) const { } template <> -LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) { +LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) { size_t expectedSizeCS = 0; [[maybe_unused]] bool usePostSync = false; if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index af61df192b..633eaaec34 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -177,7 +177,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, BlitPropertiesContainer blitPropertiesContainer; if (this->context->getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver); + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, computeCommandStreamReceiver); } const bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo); @@ -226,7 +226,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } if (this->context->getRootDeviceIndices().size() > 1) { - TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStream, csrDeps); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStream, csrDeps); } if (enqueueWithBlitAuxTranslation) { @@ -280,6 +280,17 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } else if (isMarkerWithPostSyncWrite) { processDispatchForMarker(*this, &commandStream, eventsRequest, csrDeps); } + TagNodeBase *multiRootEventSyncStamp = nullptr; + if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) { + multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode(); + if (!blockQueue) { + this->getGpgpuCommandStreamReceiver().makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation()); + } + processSignalMultiRootDeviceNode(&commandStream, multiRootEventSyncStamp); + if (CL_COMMAND_MARKER == commandType) { + flushDependenciesForNonKernelCommand = true; + } + } CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0}; const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType), @@ -382,7 +393,8 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, eventsRequest, eventBuilder, std::move(printfHandler), - nullptr); + nullptr, + multiRootEventSyncStamp); } if (deferredTimestampPackets.get()) { @@ -497,7 +509,7 @@ BlitProperties CommandQueueHw::processDispatchForBlitEnqueue(CommandS const MultiDispatchInfo &multiDispatchInfo, TimestampPacketDependencies ×tampPacketDependencies, const EventsRequest &eventsRequest, LinearStream *commandStream, - uint32_t commandType, bool queueBlocked) { + uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync) { auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType); auto blitProperties = ClBlitProperties::constructProperties(blitDirection, blitCommandStreamReceiver, @@ -510,7 +522,7 @@ BlitProperties CommandQueueHw::processDispatchForBlitEnqueue(CommandS blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.previousEnqueueNodes); blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.barrierNodes); } - + blitProperties.multiRootDeviceEventSync = multiRootDeviceEventSync; auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0); blitProperties.outputTimestampPacket = currentTimestampPacketNode; @@ -616,7 +628,20 @@ void CommandQueueHw::processDispatchForMarker(CommandQueue &commandQu HardwareInterface::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); getGpgpuCommandStreamReceiver().makeResident(*hwTimeStamps->getBaseGraphicsAllocation()); } - +template +void CommandQueueHw::processSignalMultiRootDeviceNode(LinearStream *commandStream, + TagNodeBase *node) { + const auto &hwInfo = getDevice().getHardwareInfo(); + PipeControlArgs args; + args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); + MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + *commandStream, + PostSyncMode::ImmediateData, + node->getGpuAddress() + node->getContextEndOffset(), + std::numeric_limits::max(), + hwInfo, + args); +} template void CommandQueueHw::processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue, LinearStream *commandStream, @@ -901,7 +926,8 @@ void CommandQueueHw::enqueueBlocked( EventsRequest &eventsRequest, EventBuilder &externalEventBuilder, std::unique_ptr &&printfHandler, - CommandStreamReceiver *bcsCsr) { + CommandStreamReceiver *bcsCsr, + TagNodeBase *multiRootDeviceSyncNode) { TakeOwnershipWrapper> queueOwnership(*this); @@ -972,7 +998,8 @@ void CommandQueueHw::enqueueBlocked( std::move(printfHandler), preemptionMode, multiDispatchInfo.peekMainKernel(), - (uint32_t)multiDispatchInfo.size()); + (uint32_t)multiDispatchInfo.size(), + multiRootDeviceSyncNode); } if (storeTimestampPackets) { command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies)); @@ -1274,10 +1301,14 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp } TimestampPacketDependencies timestampPacketDependencies; + TagNodeBase *multiRootEventSyncStamp = nullptr; BlitPropertiesContainer blitPropertiesContainer; CsrDependencies csrDeps; eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All); + if (this->context->getRootDeviceIndices().size() > 1) { + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, bcsCsr); + } auto allocator = bcsCsr.getTimestampPacketAllocator(); if (!blockQueue) { @@ -1304,6 +1335,10 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp if (eventBuilder.getEvent()) { eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer); } + if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) { + multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode(); + bcsCsr.makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation()); + } CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0}; @@ -1320,7 +1355,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp } blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, gpgpuCommandStream, cmdType, blockQueue)); + eventsRequest, gpgpuCommandStream, cmdType, blockQueue, multiRootEventSyncStamp)); if (!blockQueue) { completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking, @@ -1347,7 +1382,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp updateFromCompletionStamp(completionStamp, eventBuilder.getEvent()); if (blockQueue) { - enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr); + enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp); if (gpgpuSubmission) { if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) { diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h index 373b3f106c..8f18cf4d3a 100644 --- a/opencl/source/command_queue/gpgpu_walker.h +++ b/opencl/source/command_queue/gpgpu_walker.h @@ -88,7 +88,7 @@ class GpgpuWalkerHelper { template struct EnqueueOperation { using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList); + static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent); static size_t getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo); static size_t getSizeRequiredForTimestampPacketWrite(); static size_t getSizeForCacheFlushAfterWalkerCommands(const Kernel &kernel, const CommandQueue &commandQueue); @@ -101,8 +101,8 @@ struct EnqueueOperation { template LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, - Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) { - size_t expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList); + Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) { + size_t expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList, outEvent); return commandQueue.getCS(expectedSizeCS); } diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl index 02df536e6a..1bfc233caf 100644 --- a/opencl/source/command_queue/gpgpu_walker_base.inl +++ b/opencl/source/command_queue/gpgpu_walker_base.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -165,7 +165,7 @@ size_t GpgpuWalkerHelper::getSizeForWaDisableRccRhwoOptimization(cons } template -size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist) { +size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist, cl_event *outEvent) { size_t expectedSizeCS = 0; auto &hwInfo = commandQueue.getDevice().getHardwareInfo(); auto &gfxCoreHelper = commandQueue.getDevice().getGfxCoreHelper(); @@ -218,8 +218,14 @@ size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, c if (DebugManager.flags.GpuScratchRegWriteAfterWalker.get() != -1) { expectedSizeCS += sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM); } - - expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps); + expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps); + if (outEvent) { + auto pEvent = castToObjectOrAbort(*outEvent); + if ((pEvent->getContext()->getRootDeviceIndices().size() > 1) && (!pEvent->isUserEvent())) { + expectedSizeCS += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); + } + } + expectedSizeCS += MemorySynchronizationCommands::getSizeForSingleBarrier(false); return expectedSizeCS; } diff --git a/opencl/source/command_queue/hardware_interface.h b/opencl/source/command_queue/hardware_interface.h index a88e1c6d4f..021b02cb79 100644 --- a/opencl/source/command_queue/hardware_interface.h +++ b/opencl/source/command_queue/hardware_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -31,6 +31,7 @@ struct HardwareInterfaceWalkerArgs { size_t localWorkSizes[3] = {}; TagNodeBase *hwTimeStamps = nullptr; TagNodeBase *hwPerfCounter = nullptr; + TagNodeBase *multiRootDeviceEventStamp = nullptr; TimestampPacketDependencies *timestampPacketDependencies = nullptr; TimestampPacketContainer *currentTimestampPacketNodes = nullptr; const Vec3 *numberOfWorkgroups = nullptr; diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index a514157406..f1f1d7cf63 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -133,6 +133,7 @@ void HardwareInterface::dispatchWalker( walkerArgs.currentTimestampPacketNodes); walkerArgs.currentDispatchIndex = 0; + for (auto &dispatchInfo : multiDispatchInfo) { dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo()); walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel); diff --git a/opencl/source/context/context.cpp b/opencl/source/context/context.cpp index 8bf82c19cd..0f089fe97f 100644 --- a/opencl/source/context/context.cpp +++ b/opencl/source/context/context.cpp @@ -49,7 +49,9 @@ Context::Context( Context::~Context() { gtpinNotifyContextDestroy((cl_context)this); - + if (multiRootDeviceTimestampPacketAllocator.get() != nullptr) { + multiRootDeviceTimestampPacketAllocator.reset(); + } if (smallBufferPoolAllocator.isAggregatedSmallBuffersEnabled(this)) { smallBufferPoolAllocator.releaseSmallBufferPool(); } @@ -558,5 +560,15 @@ void Context::BufferPoolAllocator::releaseSmallBufferPool() { delete this->mainStorage; this->mainStorage = nullptr; } +TagAllocatorBase *Context::getMultiRootDeviceTimestampPacketAllocator() { + return multiRootDeviceTimestampPacketAllocator.get(); +} +void Context::setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr &allocator) { + multiRootDeviceTimestampPacketAllocator = std::move(allocator); +} + +std::unique_lock Context::obtainOwnershipForMultiRootDeviceAllocator() { + return std::unique_lock(multiRootDeviceAllocatorMtx); +} } // namespace NEO diff --git a/opencl/source/context/context.h b/opencl/source/context/context.h index f59630cd4f..33d7e0dbbb 100644 --- a/opencl/source/context/context.h +++ b/opencl/source/context/context.h @@ -37,6 +37,7 @@ class SharingFunctions; class SVMAllocsManager; class Program; class Platform; +class TagAllocatorBase; template <> struct OpenCLObjectMapper<_cl_context> { @@ -223,6 +224,9 @@ class Context : public BaseObject<_cl_context> { BufferPoolAllocator &getBufferPoolAllocator() { return this->smallBufferPoolAllocator; } + TagAllocatorBase *getMultiRootDeviceTimestampPacketAllocator(); + std::unique_lock obtainOwnershipForMultiRootDeviceAllocator(); + void setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr &allocator); protected: struct BuiltInKernel { @@ -263,6 +267,8 @@ class Context : public BaseObject<_cl_context> { uint32_t maxRootDeviceIndex = std::numeric_limits::max(); cl_bool preferD3dSharedResources = 0u; ContextType contextType = ContextType::CONTEXT_TYPE_DEFAULT; + std::unique_ptr multiRootDeviceTimestampPacketAllocator; + std::mutex multiRootDeviceAllocatorMtx; bool interopUserSync = false; bool resolvesRequiredInKernels = false; diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index e1752f9774..27953f1cc8 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -131,6 +131,9 @@ Event::~Event() { if (timeStampNode != nullptr) { timeStampNode->returnTag(); } + if (multiRootTimeStampSyncNode != nullptr) { + multiRootTimeStampSyncNode->returnTag(); + } if (perfCounterNode != nullptr) { cmdQueue->getPerfCounters()->deleteQuery(perfCounterNode->getQueryHandleRef()); perfCounterNode->getQueryHandleRef() = {}; @@ -883,7 +886,6 @@ TagNodeBase *Event::getHwTimeStampNode() { } TagNodeBase *Event::getHwPerfCounterNode() { - if (!perfCounterNode && cmdQueue->getPerfCounters()) { const uint32_t gpuReportSize = HwPerfCounter::getSize(*(cmdQueue->getPerfCounters())); perfCounterNode = cmdQueue->getGpgpuCommandStreamReceiver().getEventPerfCountAllocator(gpuReportSize)->getTag(); @@ -891,11 +893,27 @@ TagNodeBase *Event::getHwPerfCounterNode() { return perfCounterNode; } +TagNodeBase *Event::getMultiRootTimestampSyncNode() { + auto lock = getContext()->obtainOwnershipForMultiRootDeviceAllocator(); + if (getContext()->getMultiRootDeviceTimestampPacketAllocator() == nullptr) { + auto allocator = cmdQueue->getGpgpuCommandStreamReceiver().createMultiRootDeviceTimestampPacketAllocator(getContext()->getRootDeviceIndices()); + getContext()->setMultiRootDeviceTimestampPacketAllocator(allocator); + } + lock.unlock(); + if (multiRootDeviceTimestampPacketContainer.get() == nullptr) { + multiRootDeviceTimestampPacketContainer = std::make_unique(); + } + multiRootTimeStampSyncNode = getContext()->getMultiRootDeviceTimestampPacketAllocator()->getTag(); + multiRootDeviceTimestampPacketContainer->add(multiRootTimeStampSyncNode); + return multiRootTimeStampSyncNode; +} + void Event::addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer) { timestampPacketContainer->assignAndIncrementNodesRefCounts(inputTimestampPacketContainer); } TimestampPacketContainer *Event::getTimestampPacketNodes() const { return timestampPacketContainer.get(); } +TimestampPacketContainer *Event::getMultiRootDeviceTimestampPacketNodes() const { return multiRootDeviceTimestampPacketContainer.get(); } bool Event::checkUserEventDependencies(cl_uint numEventsInWaitList, const cl_event *eventWaitList) { bool userEventsDependencies = false; diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index a502fe415d..dd7ed18471 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -115,6 +115,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { void addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer); TimestampPacketContainer *getTimestampPacketNodes() const; + TimestampPacketContainer *getMultiRootDeviceTimestampPacketNodes() const; bool isPerfCountersEnabled() const { return perfCountersEnabled; @@ -129,6 +130,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { } TagNodeBase *getHwPerfCounterNode(); + TagNodeBase *getMultiRootTimestampSyncNode(); std::unique_ptr flushStamp; std::atomic taskLevel; @@ -384,8 +386,10 @@ class Event : public BaseObject<_cl_event>, public IDNode { bool perfCountersEnabled; TagNodeBase *timeStampNode = nullptr; TagNodeBase *perfCounterNode = nullptr; + TagNodeBase *multiRootTimeStampSyncNode = nullptr; std::unique_ptr timestampPacketContainer; // number of events this event depends on + std::unique_ptr multiRootDeviceTimestampPacketContainer; std::atomic parentCount; // event parents std::vector parentEvents; diff --git a/opencl/source/gen11/command_queue_gen11.cpp b/opencl/source/gen11/command_queue_gen11.cpp index c1efcf0cea..f06950f3c8 100644 --- a/opencl/source/gen11/command_queue_gen11.cpp +++ b/opencl/source/gen11/command_queue_gen11.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2022 Intel Corporation + * Copyright (C) 2019-2023 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/opencl/source/gen12lp/command_queue_gen12lp.cpp b/opencl/source/gen12lp/command_queue_gen12lp.cpp index 7ad2d1f9e9..e6080ff4cb 100644 --- a/opencl/source/gen12lp/command_queue_gen12lp.cpp +++ b/opencl/source/gen12lp/command_queue_gen12lp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2022 Intel Corporation + * Copyright (C) 2019-2023 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/opencl/source/gen8/command_queue_gen8.cpp b/opencl/source/gen8/command_queue_gen8.cpp index dc3fcae08e..c419d9f5d6 100644 --- a/opencl/source/gen8/command_queue_gen8.cpp +++ b/opencl/source/gen8/command_queue_gen8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/opencl/source/gen9/command_queue_gen9.cpp b/opencl/source/gen9/command_queue_gen9.cpp index 8dddd508a9..d010cc240d 100644 --- a/opencl/source/gen9/command_queue_gen9.cpp +++ b/opencl/source/gen9/command_queue_gen9.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/opencl/source/helpers/properties_helper.cpp b/opencl/source/helpers/properties_helper.cpp index 52dcaa6b1a..db1adee64d 100644 --- a/opencl/source/helpers/properties_helper.cpp +++ b/opencl/source/helpers/properties_helper.cpp @@ -26,7 +26,6 @@ namespace NEO { void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) { auto csrOwnership = dependentCsr.obtainUniqueOwnership(); dependentCsr.updateTagFromWait(); - csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast(dependentCsr.getTagAddress())}); } void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr, CsrDependencies::DependenciesType depsType) const { @@ -60,6 +59,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci if (productHelper.isDcFlushAllowed()) { if (!dependentCsr.isLatestTaskCountFlushed()) { flushDependentCsr(dependentCsr, csrDeps); + // csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast(dependentCsr.getTagAddress())}); currentCsr.makeResident(*dependentCsr.getTagAllocation()); } } @@ -68,23 +68,22 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci } } -void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr) const { +void EventsRequest::fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr) const { for (cl_uint i = 0; i < this->numEventsInWaitList; i++) { auto event = castToObjectOrAbort(this->eventWaitList[i]); if (event->isUserEvent() || CompletionStamp::notReady == event->peekTaskCount()) { continue; } - if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) { + auto timestampPacketContainer = event->getMultiRootDeviceTimestampPacketNodes(); + if (!timestampPacketContainer || timestampPacketContainer->peekNodes().empty()) { + continue; + } auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver(); if (!dependentCsr.isLatestTaskCountFlushed()) { flushDependentCsr(dependentCsr, csrDeps); - } else { - csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast(dependentCsr.getTagAddress())}); } - - auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex()); - currentCsr.getResidencyAllocations().push_back(graphicsAllocation); + csrDeps.multiRootTimeStampSyncContainer.push_back(timestampPacketContainer); } } } diff --git a/opencl/source/helpers/properties_helper.h b/opencl/source/helpers/properties_helper.h index 37b2c6564b..5a653b18e5 100644 --- a/opencl/source/helpers/properties_helper.h +++ b/opencl/source/helpers/properties_helper.h @@ -25,7 +25,7 @@ struct EventsRequest { : numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), outEvent(outEvent) {} void fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr, CsrDependencies::DependenciesType depsType) const; - void fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr) const; + void fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver ¤tCsr) const; void setupBcsCsrForOutputEvent(CommandStreamReceiver &bcsCsr) const; cl_uint numEventsInWaitList; diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 3fadb49030..5a7e823581 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -118,10 +118,11 @@ CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminate CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector surfaces, bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr &&printfHandler, - PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount) + PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, + TagNodeBase *multiRootDeviceSyncNode) : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM), commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel), - kernelCount(kernelCount), preemptionMode(preemptionMode) { + kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) { UNRECOVERABLE_IF(nullptr == this->kernel); kernel->incRefInternal(); } @@ -163,6 +164,9 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term printfHandler->makeResident(commandStreamReceiver); } makeTimestampPacketsResident(commandStreamReceiver); + if (multiRootDeviceSyncNode != nullptr) { + commandStreamReceiver.makeResident(*multiRootDeviceSyncNode->getBaseGraphicsAllocation()); + } if (kernelOperation->blitPropertiesContainer.size() > 0) { CsrDependencies csrDeps; @@ -214,7 +218,7 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term false); // hasRelaxedOrderingDependencies if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); + eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); } const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); @@ -307,7 +311,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() { blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0]; if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(blitProperties.csrDependencies, *bcsCsr); + eventsRequest.fillCsrDependenciesForRootDevices(blitProperties.csrDependencies, *bcsCsr); } const auto newTaskCount = bcsCsr->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice()); @@ -389,7 +393,7 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term false); // hasRelaxedOrderingDependencies if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); + eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); } const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); diff --git a/opencl/source/helpers/task_information.h b/opencl/source/helpers/task_information.h index 0eb3934b56..f7fb1c2aad 100644 --- a/opencl/source/helpers/task_information.h +++ b/opencl/source/helpers/task_information.h @@ -127,7 +127,7 @@ class CommandComputeKernel : public Command { public: CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector surfaces, bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr &&printfHandler, - PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount); + PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, TagNodeBase *multiRootDeviceSyncNode); ~CommandComputeKernel() override; @@ -146,6 +146,7 @@ class CommandComputeKernel : public Command { Kernel *kernel; uint32_t kernelCount; PreemptionMode preemptionMode; + TagNodeBase *multiRootDeviceSyncNode; }; class CommandWithoutKernel : public Command { diff --git a/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp b/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp index 7c502707e5..22f07d26b5 100644 --- a/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp +++ b/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp b/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp index 56cc5c5a63..3536f0329c 100644 --- a/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp +++ b/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp b/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp index 3f0647ce99..3a24e79b61 100644 --- a/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp +++ b/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp index 782b71dfc9..c369349555 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -19,6 +19,7 @@ #include "opencl/test/unit_test/command_queue/command_queue_fixture.h" #include "opencl/test/unit_test/fixtures/buffer_fixture.h" #include "opencl/test/unit_test/fixtures/image_fixture.h" +#include "opencl/test/unit_test/helpers/cl_hw_parse.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" @@ -193,6 +194,7 @@ class MockCommandStreamReceiverWithFailingFlushBatchedSubmission : public MockCo template struct MockCommandQueueHwWithOverwrittenCsr : public CommandQueueHw { using CommandQueueHw::CommandQueueHw; + using CommandQueueHw::timestampPacketContainer; MockCommandStreamReceiverWithFailingFlushBatchedSubmission *csr; CommandStreamReceiver &getGpgpuCommandStreamReceiver() const override { return *csr; } }; diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp index d0f03f32a6..a175e2e297 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp @@ -21,6 +21,7 @@ #include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" +#include "opencl/test/unit_test/mocks/mock_mdi.h" using namespace NEO; @@ -970,4 +971,4 @@ HWTEST_F(CommandQueueHwTest, GivenBuiltinKernelWhenBuiltinDispatchInfoBuilderIsP EXPECT_EQ(builder.paramsToUse.elws.x, dispatchInfo->getEnqueuedWorkgroupSize().x); EXPECT_EQ(builder.paramsToUse.offset.x, dispatchInfo->getOffset().x); EXPECT_EQ(builder.paramsToUse.kernel, dispatchInfo->getKernel()); -} +} \ No newline at end of file diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index 3f51927119..82a1c38e12 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -31,6 +31,7 @@ #include "opencl/test/unit_test/fixtures/cl_device_fixture.h" #include "opencl/test/unit_test/mocks/mock_buffer.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_mdi.h" #include "opencl/test/unit_test/mocks/mock_program.h" diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp index 10d0e767c2..9ab8a1218e 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -557,11 +557,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenTimestamp MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector({kernel1.mockKernel, kernel2.mockKernel})); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); size_t sizeWithDisabled = cmdQ.requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); size_t sizeWithEnabled = cmdQ.requestedCmdStreamSize; size_t additionalSize = 0u; @@ -669,7 +669,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenAutoLocal EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false, - false, *cmdQ.get(), multiDispatchInfo, false, false); + false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr); expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END); expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize); EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS); @@ -738,7 +738,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false, - false, *cmdQ.get(), multiDispatchInfo, false, false); + false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr); expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END); expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize); EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS); diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp index 088f24d413..81306777d4 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -234,7 +234,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_MARKER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest, - eventBuilder, std::unique_ptr(nullptr), nullptr); + eventBuilder, std::unique_ptr(nullptr), nullptr, nullptr); EXPECT_FALSE(blockedCommandsDataForDependencyFlush->blitEnqueue); } @@ -267,7 +267,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_READ_BUFFER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest, - eventBuilder, std::unique_ptr(nullptr), mockCmdQ->getBcsForAuxTranslation()); + eventBuilder, std::unique_ptr(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr); EXPECT_TRUE(blockedCommandsDataForBlitEnqueue->blitEnqueue); EXPECT_EQ(blitProperties.srcAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->srcAllocation); EXPECT_EQ(blitProperties.dstAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->dstAllocation); @@ -351,7 +351,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitEnqueueWhenDispatchingCommandsWithoutK timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false); + eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); @@ -395,7 +395,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenN1EnabledWhenDispatchingWithoutKernelThenA mockCmdQ->obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, true, bcsCsr); timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false); + eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); @@ -441,7 +441,7 @@ HWTEST_F(DispatchFlagsTests, givenMockKernelWhenSettingAdditionalKernelExecInfoT std::vector v; pKernel->setAdditionalKernelExecInfo(123u); - std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1)); + std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr)); cmd->submit(1u, false); EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, 123u); diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp index f51f97a02b..f3d47baf7a 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp @@ -54,7 +54,7 @@ HWTEST2_F(DispatchFlagsTests, whenSubmittingKernelWithAdditionalKernelExecInfoTh std::vector v; pKernel->setAdditionalKernelExecInfo(AdditionalKernelExecInfo::DisableOverdispatch); - std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1)); + std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr)); cmd->submit(1u, false); EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, AdditionalKernelExecInfo::DisableOverdispatch); diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp index 4745488d67..9515fc57d7 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp @@ -2003,10 +2003,10 @@ HWTEST_F(PauseOnGpuTests, givenGpuScratchWriteEnabledWhenEstimatingCommandStream dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); DebugManager.flags.GpuScratchRegWriteAfterWalker.set(1); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); EXPECT_EQ(baseCommandStreamSize + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM), extendedCommandStreamSize); } diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp index 49871b6595..39cf18a236 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp @@ -1014,8 +1014,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithoutW dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr); EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size + MemorySynchronizationCommands::getSizeForSingleBarrier(false), extendedCommandStreamSize); } @@ -1033,8 +1033,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueKernelTest, givenTimestampWriteEnableOnMulti dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr); EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size + ImplicitScalingDispatch::getBarrierSize(csr.peekHwInfo(), false, false), extendedCommandStreamSize); } @@ -1047,8 +1047,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithWait dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true, nullptr); EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size, extendedCommandStreamSize); } diff --git a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp index 68e1038d82..f0d5895991 100644 --- a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ #include "opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h" #include "opencl/test/unit_test/fixtures/image_fixture.h" #include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h" +#include "opencl/test/unit_test/mocks/mock_event.h" using namespace NEO; @@ -96,7 +97,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenFillingBufferThenHeapsAndCommandBufferCo auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_FILL_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -149,7 +150,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenCopyingBufferThenHeapsAndCommandBufferCo auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -203,7 +204,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferNonBlockingThenHeapsAndComm auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -258,7 +259,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferBlockingThenThenHeapsAndCom auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -313,7 +314,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferNonBlockingThenHeapsAndComm auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -365,7 +366,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -380,6 +381,68 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand EXPECT_GE(expectedSizeSSH, usedAfterSSH - usedBeforeSSH); } +HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForSingleDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) { + UltClDeviceFactory deviceFactory{1, 0}; + DebugManager.flags.EnableMultiRootDeviceContexts.set(true); + + cl_device_id devices[] = {deviceFactory.rootDevices[0]}; + + MockContext pContext(ClDeviceVector(devices, 1)); + MockKernelWithInternals mockKernel(*pContext.getDevices()[0]); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + auto event = std::make_unique>(&pContext, nullptr, 0, 0, 0); + cl_event clEvent = event.get(); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent); + + EXPECT_EQ(baseCommandStreamSize, extendedCommandStreamSize); +} + +HWTEST_F(GetSizeRequiredBufferTest, GivenUserEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) { + UltClDeviceFactory deviceFactory{2, 0}; + DebugManager.flags.EnableMultiRootDeviceContexts.set(true); + + cl_device_id devices[] = {deviceFactory.rootDevices[0], + deviceFactory.rootDevices[1]}; + + MockContext pContext(ClDeviceVector(devices, 2)); + MockKernelWithInternals mockKernel(*pContext.getDevices()[0]); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + auto userEvent1 = std::make_unique(&pContext); + cl_event clEvent = userEvent1.get(); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent); + + EXPECT_EQ(baseCommandStreamSize, extendedCommandStreamSize); +} + +HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsAdded) { + UltClDeviceFactory deviceFactory{2, 0}; + DebugManager.flags.EnableMultiRootDeviceContexts.set(true); + + cl_device_id devices[] = {deviceFactory.rootDevices[0], + deviceFactory.rootDevices[1]}; + + MockContext pContext(ClDeviceVector(devices, 2)); + MockKernelWithInternals mockKernel(*pContext.getDevices()[0]); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + auto event = std::make_unique>(&pContext, nullptr, 0, 0, 0); + cl_event clEvent = event.get(); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent); + + EXPECT_EQ(baseCommandStreamSize + MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(pContext.getDevices()[0]->getHardwareInfo(), false), extendedCommandStreamSize); +} + HWTEST_F(GetSizeRequiredBufferTest, givenMultipleKernelRequiringSshWhenTotalSizeIsComputedThenItIsProperlyAligned) { auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer, pCmdQ->getClDevice()); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp index 00f97abef7..b787c05505 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp @@ -1903,7 +1903,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenItIsUnblocke blockedCommandsData->setHeaps(dsh, ioh, ssh); std::vector surfaces; - event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1)); + event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr)); event->submitCommand(false); EXPECT_EQ(numGrfRequired, csr->savedDispatchFlags.numGrfRequired); @@ -1948,7 +1948,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenInitializeBc auto blockedCommandsData = std::make_unique(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage()); std::vector surfaces; - event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1)); + event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr)); event->submitCommand(false); EXPECT_FALSE(pCmdQ->isCsrLocked); } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp index 92b41058bb..16ca22431d 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp @@ -7,6 +7,7 @@ #include "shared/source/command_stream/wait_status.h" #include "shared/test/common/mocks/mock_command_stream_receiver.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/hw_test.h" @@ -14,6 +15,7 @@ #include "opencl/source/event/user_event.h" #include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h" #include "opencl/test/unit_test/fixtures/ult_command_stream_receiver_fixture.h" +#include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_program.h" #include "opencl/test/unit_test/test_macros/test_checks_ocl.h" @@ -45,12 +47,18 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu MockGraphicsAllocation svmAlloc(svmPtr, svmSize); Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + auto node1 = event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + auto node3 = event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + auto node4 = event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + auto node5 = event5.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); + userEvent1.getMultiRootTimestampSyncNode(); UserEvent userEvent2(&pCmdQ2->getContext()); + userEvent2.getMultiRootTimestampSyncNode(); userEvent1.setStatus(CL_COMPLETE); userEvent2.setStatus(CL_COMPLETE); @@ -87,12 +95,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu EXPECT_EQ(2u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } { @@ -115,12 +123,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu EXPECT_EQ(2u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } alignedFree(svmPtr); } @@ -147,17 +155,24 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo cl_device_id devices[] = {device1, device2, device3}; auto context = std::make_unique(ClDeviceVector(devices, 3), false); - + auto mockTagAllocator = std::make_unique>(context->getRootDeviceIndices(), device1->getExecutionEnvironment()->memoryManager.get(), 10u); + std::unique_ptr uniquePtr(mockTagAllocator.release()); + context->setMultiRootDeviceTimestampPacketAllocator(uniquePtr); auto pCmdQ1 = context->getSpecialQueue(1u); auto pCmdQ2 = context->getSpecialQueue(2u); auto pCmdQ3 = context->getSpecialQueue(3u); Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + auto node1 = event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + auto node3 = event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + auto node4 = event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ3, CL_COMMAND_NDRANGE_KERNEL, 7, 21); + auto node5 = event5.getMultiRootTimestampSyncNode(); Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + auto node6 = event6.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -190,16 +205,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(21u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); auto semaphoreCmd2 = genCmdCast(*(semaphores[2])); - EXPECT_EQ(7u, semaphoreCmd2->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node6->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress()); } { @@ -215,16 +230,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); auto semaphoreCmd2 = genCmdCast(*(semaphores[2])); - EXPECT_EQ(21u, semaphoreCmd2->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress()); } { @@ -249,8 +264,8 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); } } @@ -286,11 +301,16 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6); + event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + event5.getMultiRootTimestampSyncNode(); Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + event6.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -316,10 +336,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); CsrDependencies csrDeps; - eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver()); + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver()); - EXPECT_EQ(0u, csrDeps.taskCountContainer.size()); - EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps)); + EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size()); + EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps)); } { @@ -342,10 +362,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); CsrDependencies csrDeps; - eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); - EXPECT_EQ(3u, csrDeps.taskCountContainer.size()); - EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps)); + EXPECT_EQ(3u, csrDeps.multiRootTimeStampSyncContainer.size()); + EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps)); } } @@ -376,6 +396,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW cl_event outputEvent2{}; + auto currentCsUsedCmdq1 = pCmdQ1->getCS(0).getUsed(); pCmdQ2->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr, 1, &outputEvent1, @@ -399,14 +420,12 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW nullptr); { HardwareParse csHwParser; - csHwParser.parseCommands(pCmdQ1->getCS(0)); + csHwParser.parseCommands(pCmdQ1->getCS(0), currentCsUsedCmdq1); auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); EXPECT_EQ(0u, semaphores.size()); } userEvent1.setStatus(CL_COMPLETE); - event1->release(); - event2->release(); pCmdQ1->finish(); pCmdQ2->finish(); { @@ -417,7 +436,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd = genCmdCast(*(semaphores[0])); EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress()); + EXPECT_EQ(reinterpret_cast(event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress()); } { HardwareParse csHwParser; @@ -426,9 +445,11 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd = genCmdCast(*(semaphores[0])); - EXPECT_EQ(0u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress()); } + event1->release(); + event2->release(); buffer->release(); } @@ -458,14 +479,14 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW char hostPtr[MemoryConstants::pageSize]{}; cl_event outputEvent2{}; - + auto currentCsUsed = pCmdQ1->getCS(0).getUsed(); pCmdQ1->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr, 1, &outputEvent1, &outputEvent2); { HardwareParse csHwParser; - csHwParser.parseCommands(pCmdQ1->getCS(0)); + csHwParser.parseCommands(pCmdQ1->getCS(0), currentCsUsed); auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); EXPECT_EQ(0u, semaphores.size()); @@ -482,7 +503,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW nullptr); { HardwareParse csHwParser; - csHwParser.parseCommands(pCmdQ1->getCS(0)); + csHwParser.parseCommands(pCmdQ1->getCS(0), currentCsUsed); auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); EXPECT_EQ(0u, semaphores.size()); @@ -590,9 +611,6 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(0u, semaphores.size()); } userEvent1.setStatus(CL_COMPLETE); - event1->release(); - event2->release(); - event3->release(); pCmdQ1->finish(); pCmdQ2->finish(); @@ -604,7 +622,8 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd = genCmdCast(*(semaphores[0])); EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress()); + auto node = event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0); + EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd->getSemaphoreGraphicsAddress()); } { HardwareParse csHwParser; @@ -620,8 +639,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(2u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(0u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + auto node = event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0); + EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd0->getSemaphoreGraphicsAddress()); } { HardwareParse csHwParser; @@ -630,6 +650,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_LE(1u, semaphores.size()); } + event1->release(); + event2->release(); + event3->release(); buffer->release(); pCmdQ1->release(); pCmdQ2->release(); @@ -961,3 +984,73 @@ HWTEST_F(BcsCrossDeviceMigrationTests, givenBufferWithMultiStorageWhenEnqueueRea EXPECT_EQ(buffer.get(), cmdQueue->migrateMultiGraphicsAllocationsReceivedOperationParams.srcMemObj); } + +HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) { + Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); + Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6); + Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + UserEvent userEvent1(&pCmdQ1->getContext()); + UserEvent userEvent2(&pCmdQ2->getContext()); + + userEvent1.setStatus(CL_COMPLETE); + userEvent2.setStatus(CL_COMPLETE); + { + cl_event eventWaitList[] = + { + &event1, + &event2, + &event3, + &event4, + &event5, + &event6, + &userEvent1, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + + EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); + CsrDependencies csrDeps; + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); + + EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size()); + } +} +HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeContainersThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) { + + MockEvent event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + event1.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); + MockEvent event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + event3.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + event4.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + event5.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + event6.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + UserEvent userEvent1(&pCmdQ1->getContext()); + + userEvent1.setStatus(CL_COMPLETE); + + { + cl_event eventWaitList[] = + { + &event1, + &event2, + &event3, + &event4, + &event5, + &event6, + &userEvent1, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + + EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); + CsrDependencies csrDeps; + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); + + EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size()); + } +} diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp index 12caf89da5..504b4a885d 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp @@ -11,6 +11,7 @@ #include "shared/source/command_stream/wait_status.h" #include "shared/source/helpers/constants.h" #include "shared/source/helpers/logical_state_helper.h" +#include "shared/source/os_interface/device_factory.h" #include "shared/source/os_interface/hw_info_config.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp index 33f481cb0d..c73bd35dec 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp @@ -1791,4 +1791,4 @@ HWTEST_F(BcsTests, givenHostPtrToImageWhenBlitBufferIsCalledThenBlitCmdIsFound) hwParser.parseCommands(csr.commandStream, 0); auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); EXPECT_NE(hwParser.cmdList.end(), cmdIterator); -} +} \ No newline at end of file diff --git a/opencl/test/unit_test/event/event_builder_tests.cpp b/opencl/test/unit_test/event/event_builder_tests.cpp index 28d1bcb19b..ff0cda16de 100644 --- a/opencl/test/unit_test/event/event_builder_tests.cpp +++ b/opencl/test/unit_test/event/event_builder_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -79,7 +79,7 @@ TEST(EventBuilder, givenVirtualEventWithCommandThenFinalizeAddChild) { public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); @@ -129,7 +129,7 @@ TEST(EventBuilder, givenVirtualEventWithSubmittedCommandAsParentThenFinalizeNotA public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp index 021bee90d2..f359aedaa1 100644 --- a/opencl/test/unit_test/event/event_tests.cpp +++ b/opencl/test/unit_test/event/event_tests.cpp @@ -485,7 +485,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandFalseWhenSubmittingCommandsThenRefA PreemptionMode preemptionMode = pDevice->getPreemptionMode(); v.push_back(bufferSurf); - auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); auto taskLevelBefore = csr.peekTaskLevel(); @@ -528,7 +528,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandTrueWhenSubmittingCommandsThenRefAp NullSurface *surface = new NullSurface; v.push_back(surface); PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); auto taskLevelBefore = csr.peekTaskLevel(); @@ -579,7 +579,7 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut std::vector v; PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); event.submitCommand(false); @@ -631,7 +631,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnCmdQueueWaitFunctionAndBlockedKernelWit std::vector v; PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); event.submitCommand(false); @@ -680,7 +680,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnPrintingEnqueueOutputAndBlockedKernelWi std::vector v; PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); event.submitCommand(false); @@ -1169,7 +1169,7 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCsrOccurs) { public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; class MockEvent : public Event { public: @@ -1750,7 +1750,7 @@ HWTEST_F(InternalsEventTest, givenAbortedCommandWhenSubmitCalledThenDontUpdateFl blockedCommandsData->setHeaps(dsh, ioh, ssh); PreemptionMode preemptionMode = pDevice->getPreemptionMode(); std::vector v; - auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr); event->setCommand(std::unique_ptr(cmd)); FlushStamp expectedFlushStamp = 0; @@ -1893,3 +1893,35 @@ TEST(EventTimestampTest, givenEnableTimestampWaitWhenCheckIsTimestampWaitEnabled EXPECT_TRUE(event.isWaitForTimestampsEnabled()); } } +TEST(MultiRootEvent, givenContextWithMultiRootTagAllocatorWhenEventGetsTagThenNewAllocatorIsNotCreated) { + auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); + MockContext context{}; + MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false); + auto allocator = cmdQ.getGpgpuCommandStreamReceiver().createMultiRootDeviceTimestampPacketAllocator(context.getRootDeviceIndices()); + auto allocatorPtr = allocator.get(); + context.setMultiRootDeviceTimestampPacketAllocator(allocator); + MockEvent event{&cmdQ, CL_COMMAND_MARKER, 0, 0}; + event.getMultiRootTimestampSyncNode(); + EXPECT_EQ(allocatorPtr, context.getMultiRootDeviceTimestampPacketAllocator()); +} +TEST(MultiRootEvent, givenContextWithoutMultiRootTagAllocatorWhenEventGetsTagThenNewAllocatorIsCreated) { + auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); + MockContext context{}; + MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false); + EXPECT_EQ(context.getMultiRootDeviceTimestampPacketAllocator(), nullptr); + MockEvent event{&cmdQ, CL_COMMAND_MARKER, 0, 0}; + event.getMultiRootTimestampSyncNode(); + EXPECT_NE(context.getMultiRootDeviceTimestampPacketAllocator(), nullptr); +} +TEST(MultiRootEvent, givenEventWithTagWhenEventGetsNewTagThenNewTagContainerIsNotCreated) { + auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); + MockContext context{}; + MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false); + MockEvent event{&cmdQ, CL_COMMAND_MARKER, 0, 0}; + EXPECT_EQ(event.getMultiRootDeviceTimestampPacketNodes(), nullptr); + event.getMultiRootTimestampSyncNode(); + auto containerPtr = event.getMultiRootDeviceTimestampPacketNodes(); + EXPECT_NE(containerPtr, nullptr); + event.getMultiRootTimestampSyncNode(); + EXPECT_EQ(containerPtr, event.getMultiRootDeviceTimestampPacketNodes()); +} \ No newline at end of file diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp index 98e8f404d4..df7c3da23a 100644 --- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp +++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp @@ -2435,7 +2435,7 @@ HWTEST_F(GTPinTests, givenGtPinInitializedWhenSubmittingKernelCommandThenFlushed gtpinNotifyKernelSubmit(kernel.mockMultiDeviceKernel, mockCmdQ.get()); - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr)); CompletionStamp stamp = command->submit(20, false); ASSERT_EQ(1u, kernelExecQueue.size()); diff --git a/opencl/test/unit_test/helpers/task_information_tests.cpp b/opencl/test/unit_test/helpers/task_information_tests.cpp index ead89cf244..302f090012 100644 --- a/opencl/test/unit_test/helpers/task_information_tests.cpp +++ b/opencl/test/unit_test/helpers/task_information_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -154,7 +154,7 @@ TEST(CommandTest, givenWaitlistRequestWhenCommandComputeKernelIsCreatedThenMakeL public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); @@ -291,7 +291,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD for (auto &surface : surfaces) { requiresCoherency |= surface->IsCoherent; } - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr)); command->submit(20, false); EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode); @@ -339,7 +339,7 @@ HWTEST_F(DispatchFlagsTests, givenClCommandCopyImageWhenSubmitThenFlushTextureCa for (auto &surface : surfaces) { requiresCoherency |= surface->IsCoherent; } - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1, nullptr)); command->submit(20, false); EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode); @@ -425,7 +425,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD bool flushDC = false; bool slmUsed = false; bool ndRangeKernel = false; - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr)); command->submit(20, false); EXPECT_TRUE(mockCsr->passedDispatchFlags.epilogueRequired); diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index 6f2b2526d3..5d1fdd51b3 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -34,11 +34,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl auto mockCmdQHw = std::make_unique>(context, device.get(), nullptr); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; auto extendedSize = sizeWithDisabled + sizeof(typename FamilyType::PIPE_CONTROL); @@ -52,7 +52,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, - false, multiDispatchInfo, nullptr, 0, false, false); + false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; @@ -82,7 +82,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat eventsRequest.fillCsrDependenciesForTimestampPacketContainer( csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; @@ -143,7 +143,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr auto mockCmdQHw = std::make_unique>(context, device.get(), nullptr); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; @@ -172,7 +172,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr CsrDependencies csrDeps; eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; diff --git a/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp b/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp index 7f5bffca9e..cef27f5bdc 100644 --- a/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2022 Intel Corporation + * Copyright (C) 2019-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -212,7 +212,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd { EXPECT_FALSE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ)); - initialSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false); + initialSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr); } { @@ -226,7 +226,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd ultCsr.multiOsContextCapable = false; EXPECT_TRUE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ)); - sizeWithCacheFlush = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false); + sizeWithCacheFlush = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr); } EXPECT_EQ(initialSize + expectedDiff, sizeWithCacheFlush); diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index 4d972e4d06..8c6b3d2b45 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -504,6 +504,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver { using BaseClass::CommandStreamReceiver; TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; } + std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override { return std::unique_ptr(nullptr); } SubmissionStatus flushTagUpdate() override { return SubmissionStatus::SUCCESS; }; void updateTagFromWait() override{}; diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index 7a9210ab85..82f13e4521 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -766,11 +766,11 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferOperationWithoutKernelWhenEstimati auto &hwInfo = cmdQ->getDevice().getHardwareInfo(); auto readBufferCmdsSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, csrDependencies, false, false, - true, *cmdQ, multiDispatchInfo, false, false); + true, *cmdQ, multiDispatchInfo, false, false, nullptr); auto writeBufferCmdsSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, csrDependencies, false, false, - true, *cmdQ, multiDispatchInfo, false, false); + true, *cmdQ, multiDispatchInfo, false, false, nullptr); auto copyBufferCmdsSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, csrDependencies, false, false, - true, *cmdQ, multiDispatchInfo, false, false); + true, *cmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); if (cmdQ->isCacheFlushForBcsRequired()) { diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 553b0f9397..6a501e2546 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -249,6 +249,7 @@ class MockCommandQueueHw : public CommandQueueHw { using BaseClass::latestSentEnqueueType; using BaseClass::obtainCommandStream; using BaseClass::obtainNewTimestampPacketNodes; + using BaseClass::processDispatchForKernels; using BaseClass::requiresCacheFlushAfterWalker; using BaseClass::throttle; using BaseClass::timestampPacketContainer; diff --git a/opencl/test/unit_test/mocks/mock_event.h b/opencl/test/unit_test/mocks/mock_event.h index 376e638f39..6636dd9362 100644 --- a/opencl/test/unit_test/mocks/mock_event.h +++ b/opencl/test/unit_test/mocks/mock_event.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -39,6 +39,7 @@ struct MockEvent : public BaseEventType { using Event::calculateSubmitTimestampData; using Event::isWaitForTimestampsEnabled; using Event::magic; + using Event::multiRootDeviceTimestampPacketContainer; using Event::queueTimeStamp; using Event::submitTimeStamp; using Event::timestampPacketContainer; diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp index d456c5792d..7dec14a546 100644 --- a/opencl/test/unit_test/profiling/profiling_tests.cpp +++ b/opencl/test/unit_test/profiling/profiling_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -71,13 +71,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor MultiDispatchInfo multiDispatchInfo(&kernel); auto &commandStreamNDRangeKernel = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, &kernel, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamNDRangeKernel.getAvailableSpace(), requiredSize); auto &commandStreamTask = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, &kernel, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize); @@ -93,13 +93,13 @@ HWTEST_F(ProfilingTests, GivenCommandQueueWithProfilingAndForWorkloadWithNoKerne MultiDispatchInfo multiDispatchInfo(nullptr); auto &commandStreamMigrateMemObjects = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, false, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize); auto &commandStreamMarker = getCommandStream(*pCmdQ, CsrDependencies(), true, - false, false, multiDispatchInfo, nullptr, 0, false, false); + false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MARKER, true, false, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize); @@ -121,9 +121,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor multiDispatchInfo.push(dispatchInfo); multiDispatchInfo.push(dispatchInfo); auto &commandStreamTask = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_TASK, CsrDependencies(), true, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize); } @@ -741,13 +741,13 @@ HWTEST_F(ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCount MultiDispatchInfo multiDispatchInfo(nullptr); auto &commandStreamMigrateMemObjects = getCommandStream(*pCmdQ, CsrDependencies(), true, true, false, multiDispatchInfo, - nullptr, 0, false, false); + nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, true, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize); auto &commandStreamMarker = getCommandStream(*pCmdQ, CsrDependencies(), true, true, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MARKER, true, true, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize); diff --git a/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp b/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp index 4d23062f48..1d71c71ad1 100644 --- a/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp +++ b/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp @@ -243,10 +243,14 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent MockGraphicsAllocation svmAlloc(svmPtr, svmSize); Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + auto node1 = event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + auto node3 = event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + auto node4 = event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + auto node5 = event5.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -285,12 +289,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } { @@ -313,12 +317,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } alignedFree(svmPtr); } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index f251a52b0a..b5aa2b9bd1 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -221,6 +221,7 @@ class CommandStreamReceiver { TagAllocatorBase *getEventTsAllocator(); TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize); virtual TagAllocatorBase *getTimestampPacketAllocator() = 0; + virtual std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) = 0; virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation); diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index 9da3c83fb2..40699214f6 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -130,6 +130,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { GraphicsAllocation *getClearColorAllocation() override; TagAllocatorBase *getTimestampPacketAllocator() override; + std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override; void postInitFlagsSetup() override; void programActivePartitionConfig(LinearStream &csr); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index eddb66ec7e..161228b435 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -406,7 +406,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto commandStreamStartCSR = commandStreamCSR.getUsed(); TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStreamCSR, dispatchFlags.csrDependencies); - TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStreamCSR, dispatchFlags.csrDependencies); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStreamCSR, dispatchFlags.csrDependencies); programActivePartitionConfigFlushTask(commandStreamCSR); programEngineModeCommands(commandStreamCSR, dispatchFlags); @@ -980,7 +980,7 @@ size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const Dispat } size += TimestampPacketHelper::getRequiredCmdStreamSize(dispatchFlags.csrDependencies); - size += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(dispatchFlags.csrDependencies); + size += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(dispatchFlags.csrDependencies); size += EncodeKernelArgsBuffer::getKernelArgsBufferCmdsSize(kernelArgsBufferAllocation, logicalStateHelper.get()); @@ -1196,7 +1196,7 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert for (auto &blitProperties : blitPropertiesContainer) { TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, blitProperties.csrDependencies); - TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStream, blitProperties.csrDependencies); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStream, blitProperties.csrDependencies); BlitCommandsHelper::encodeWa(commandStream, blitProperties, latestSentBcsWaValue); @@ -1229,6 +1229,12 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert if (blitProperties.clearColorAllocation) { makeResident(*blitProperties.clearColorAllocation); } + if (blitProperties.multiRootDeviceEventSync != nullptr) { + MiFlushArgs args; + args.commandWithPostSync = true; + args.notifyEnable = isUsedNotifyEnableForPostSync(); + EncodeMiFlushDW::programMiFlushDw(commandStream, blitProperties.multiRootDeviceEventSync->getGpuAddress() + blitProperties.multiRootDeviceEventSync->getContextEndOffset(), std::numeric_limits::max(), args, hwInfo); + } } BlitCommandsHelper::programGlobalSequencerFlush(commandStream); @@ -1245,7 +1251,6 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), false, peekHwInfo()); } - if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnBlitCopy.get(), taskCount, PauseOnGpuProperties::PauseMode::AfterWorkload)) { BlitCommandsHelper::dispatchDebugPauseCommands(commandStream, getDebugPauseStateGPUAddress(), DebugPauseState::waitingForUserEndConfirmation, @@ -1522,6 +1527,11 @@ TagAllocatorBase *CommandStreamReceiverHw::getTimestampPacketAllocato return timestampPacketAllocator.get(); } +template +std::unique_ptr CommandStreamReceiverHw::createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) { + auto &gfxCoreHelper = getGfxCoreHelper(); + return gfxCoreHelper.createTimestampPacketAllocator(rootDeviceIndices, getMemoryManager(), getPreferredTagPoolSize(), getType(), osContext->getDeviceBitfield()); +} template void CommandStreamReceiverHw::postInitFlagsSetup() { useNewResourceImplicitFlush = checkPlatformSupportsNewResourceImplicitFlush(); diff --git a/shared/source/command_stream/csr_deps.h b/shared/source/command_stream/csr_deps.h index a601179e88..1e1b8d0130 100644 --- a/shared/source/command_stream/csr_deps.h +++ b/shared/source/command_stream/csr_deps.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -22,7 +22,7 @@ class CsrDependencies { All }; - StackVec, 32> taskCountContainer; + StackVec multiRootTimeStampSyncContainer; StackVec timestampPacketContainer; void makeResident(CommandStreamReceiver &commandStreamReceiver) const; diff --git a/shared/source/helpers/blit_commands_helper.cpp b/shared/source/helpers/blit_commands_helper.cpp index 57cb67c5de..67fa909e65 100644 --- a/shared/source/helpers/blit_commands_helper.cpp +++ b/shared/source/helpers/blit_commands_helper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2022 Intel Corporation + * Copyright (C) 2019-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -50,6 +50,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants: BlitterConstants::BlitDirection::HostPtrToImage == blitDirection) { return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync blitDirection, // blitDirection {}, // csrDependencies AuxTranslationDirection::None, // auxTranslationDirection @@ -73,6 +74,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants: } else { return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync blitDirection, // blitDirection {}, // csrDependencies AuxTranslationDirection::None, // auxTranslationDirection @@ -104,6 +106,7 @@ BlitProperties BlitProperties::constructPropertiesForCopy(GraphicsAllocation *ds return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection {}, // csrDependencies AuxTranslationDirection::None, // auxTranslationDirection @@ -128,6 +131,7 @@ BlitProperties BlitProperties::constructPropertiesForAuxTranslation(AuxTranslati auto allocationSize = allocation->getUnderlyingBufferSize(); return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection {}, // csrDependencies auxTranslationDirection, // auxTranslationDirection diff --git a/shared/source/helpers/blit_commands_helper.h b/shared/source/helpers/blit_commands_helper.h index f5f4d345bd..8302f2bc08 100644 --- a/shared/source/helpers/blit_commands_helper.h +++ b/shared/source/helpers/blit_commands_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2022 Intel Corporation + * Copyright (C) 2019-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -61,6 +61,7 @@ struct BlitProperties { CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr); TagNodeBase *outputTimestampPacket = nullptr; + TagNodeBase *multiRootDeviceEventSync = nullptr; BlitterConstants::BlitDirection blitDirection = BlitterConstants::BlitDirection::BufferToHostPtr; CsrDependencies csrDependencies; AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None; diff --git a/shared/source/helpers/blit_commands_helper_base.inl b/shared/source/helpers/blit_commands_helper_base.inl index bef912ec91..d310dcc425 100644 --- a/shared/source/helpers/blit_commands_helper_base.inl +++ b/shared/source/helpers/blit_commands_helper_base.inl @@ -127,7 +127,7 @@ size_t BlitCommandsHelper::estimateBlitCommandSize(const Vec3 sizePerBlit += estimatePostBlitCommandSize(); return TimestampPacketHelper::getRequiredCmdStreamSize(csrDependencies) + - TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDependencies) + + TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDependencies) + (sizePerBlit * nBlits) + timestampCmdSize + estimatePreBlitCommandSize(); @@ -143,6 +143,9 @@ size_t BlitCommandsHelper::estimateBlitCommandsSize(const BlitPropert auto isImage = blitProperties.isImageOperation(); size += BlitCommandsHelper::estimateBlitCommandSize(blitProperties.copySize, blitProperties.csrDependencies, updateTimestampPacket, profilingEnabled, isImage, rootDeviceEnvironment, blitProperties.isSystemMemoryPoolUsed); + if (blitProperties.multiRootDeviceEventSync != nullptr) { + size += EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite(); + } } size += BlitCommandsHelper::getWaCmdsSize(blitPropertiesContainer); size += 2 * MemorySynchronizationCommands::getSizeForAdditonalSynchronization(*rootDeviceEnvironment.getHardwareInfo()); diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index a3b67f50c0..da7e68d65e 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -144,17 +144,11 @@ struct TimestampPacketHelper { } template - static void programCsrDependenciesForForTaskCountContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) { - auto &taskCountContainer = csrDependencies.taskCountContainer; - - for (auto &[taskCountPreviousRootDevice, tagAddressPreviousRootDevice] : taskCountContainer) { - using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; - using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; - - EncodeSempahore::addMiSemaphoreWaitCommand(cmdStream, - static_cast(tagAddressPreviousRootDevice), - static_cast(taskCountPreviousRootDevice), - COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + static void programCsrDependenciesForForMultiRootDeviceSyncContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) { + for (auto timestampPacketContainer : csrDependencies.multiRootTimeStampSyncContainer) { + for (auto &node : timestampPacketContainer->peekNodes()) { + TimestampPacketHelper::programSemaphore(cmdStream, *node); + } } } @@ -217,8 +211,8 @@ struct TimestampPacketHelper { } template - static size_t getRequiredCmdStreamSizeForTaskCountContainer(const CsrDependencies &csrDependencies) { - return csrDependencies.taskCountContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); + static size_t getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(const CsrDependencies &csrDependencies) { + return csrDependencies.multiRootTimeStampSyncContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); } }; diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index b413a2140f..40564db786 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -18,6 +18,7 @@ #include "shared/source/memory_manager/graphics_allocation.h" #include "shared/source/memory_manager/surface.h" #include "shared/source/os_interface/os_context.h" +#include "shared/source/utilities/tag_allocator.h" #include "shared/test/common/helpers/dispatch_flags_helper.h" #include @@ -94,6 +95,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { }; TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; } + std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override { return std::unique_ptr(nullptr); } CompletionStamp flushTask( LinearStream &commandStream, diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 777ba25566..2b5d0f0e21 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -15,10 +15,13 @@ #include "shared/source/helpers/api_specific_config.h" #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/source/memory_manager/surface.h" +#include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/os_interface/device_factory.h" #include "shared/source/os_interface/hw_info_config.h" #include "shared/source/os_interface/os_interface.h" #include "shared/source/utilities/tag_allocator.h" +#include "shared/test/common/cmd_parse/gen_cmd_parse.h" +#include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/fixtures/command_stream_receiver_fixture.inl" #include "shared/test/common/fixtures/device_fixture.h" #include "shared/test/common/helpers/batch_buffer_helper.h" @@ -32,6 +35,7 @@ #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_internal_allocation_storage.h" #include "shared/test/common/mocks/mock_memory_manager.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/hw_test.h" #include "shared/test/common/test_macros/test_checks_shared.h" @@ -2463,3 +2467,89 @@ HWTEST_F(CommandStreamReceiverHwTest, givenVariousCsrModeWhenGettingTbxModeThenE ultCsr.commandStreamReceiverType = CommandStreamReceiverType::CSR_TBX_WITH_AUB; EXPECT_TRUE(ultCsr.isTbxMode()); } + +HWTEST_F(CommandStreamReceiverHwTest, GivenTwoRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForTwoDevicesCreated) { + auto executionEnvironment = std::make_unique(defaultHwInfo.get(), true, 2u); + auto devices = DeviceFactory::createDevices(*executionEnvironment.release()); + const RootDeviceIndicesContainer indices = {0u, 1u}; + auto csr = devices[0]->getDefaultEngine().commandStreamReceiver; + auto allocator = csr->createMultiRootDeviceTimestampPacketAllocator(indices); + class MockTagAllocatorBase : public TagAllocatorBase { + public: + using TagAllocatorBase::maxRootDeviceIndex; + }; + EXPECT_EQ(reinterpret_cast(allocator.get())->maxRootDeviceIndex, 1u); +} +HWTEST_F(CommandStreamReceiverHwTest, GivenFiveRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForFiveDevicesCreated) { + auto executionEnvironment = std::make_unique(defaultHwInfo.get(), true, 4u); + auto devices = DeviceFactory::createDevices(*executionEnvironment.release()); + const RootDeviceIndicesContainer indices = {0u, 1u, 2u, 3u}; + auto csr = devices[0]->getDefaultEngine().commandStreamReceiver; + auto allocator = csr->createMultiRootDeviceTimestampPacketAllocator(indices); + class MockTagAllocatorBase : public TagAllocatorBase { + public: + using TagAllocatorBase::maxRootDeviceIndex; + }; + EXPECT_EQ(reinterpret_cast(allocator.get())->maxRootDeviceIndex, 3u); +} +HWTEST_F(CommandStreamReceiverHwTest, givenMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushAdded) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + auto mockTagAllocator = std::make_unique>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + + auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr, + commandStreamReceiver, commandStreamReceiver.getTagAllocation(), nullptr, + commandStreamReceiver.getTagAllocation()->getUnderlyingBuffer(), + commandStreamReceiver.getTagAllocation()->getGpuAddress(), 0, + 0, 0, 0, 0, 0, 0, 0); + auto tag = mockTagAllocator->getTag(); + blitProperties.multiRootDeviceEventSync = tag; + + BlitPropertiesContainer container; + container.push_back(blitProperties); + commandStreamReceiver.flushBcsTask(container, true, false, *pDevice); + HardwareParse hwParser; + hwParser.parseCommands(commandStreamReceiver.commandStream, 0); + + auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + bool nodeAddressFound = false; + while (cmdIterator != hwParser.cmdList.end()) { + auto flush = genCmdCast(*cmdIterator); + if (flush->getDestinationAddress() == tag->getGpuAddress() + tag->getContextEndOffset()) { + nodeAddressFound = true; + break; + } + cmdIterator = find(++cmdIterator, hwParser.cmdList.end()); + } + EXPECT_TRUE(nodeAddressFound); +} +HWTEST_F(CommandStreamReceiverHwTest, givenNullPtrAsMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushNotAdded) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + auto mockTagAllocator = std::make_unique>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + + auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr, + commandStreamReceiver, commandStreamReceiver.getTagAllocation(), nullptr, + commandStreamReceiver.getTagAllocation()->getUnderlyingBuffer(), + commandStreamReceiver.getTagAllocation()->getGpuAddress(), 0, + 0, 0, 0, 0, 0, 0, 0); + auto tag = mockTagAllocator->getTag(); + + BlitPropertiesContainer container; + container.push_back(blitProperties); + commandStreamReceiver.flushBcsTask(container, true, false, *pDevice); + HardwareParse hwParser; + hwParser.parseCommands(commandStreamReceiver.commandStream, 0); + + auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + bool nodeAddressFound = false; + while (cmdIterator != hwParser.cmdList.end()) { + auto flush = genCmdCast(*cmdIterator); + if (flush->getDestinationAddress() == tag->getGpuAddress() + tag->getContextEndOffset()) { + nodeAddressFound = true; + break; + } + cmdIterator = find(++cmdIterator, hwParser.cmdList.end()); + } + EXPECT_FALSE(nodeAddressFound); +} \ No newline at end of file diff --git a/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp b/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp index fc4556ca11..7a42476b67 100644 --- a/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp +++ b/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp @@ -15,6 +15,7 @@ #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/test_checks_shared.h" @@ -663,3 +664,25 @@ HWTEST2_F(BlitTests, givenPlatformWhenCallingDispatchPreBlitCommandThenNoneMiFlu auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); ASSERT_EQ(hwParser.cmdList.end(), cmdIterator); } + +HWTEST_F(BlitTests, givenPlatformWhenCallingDispatchPreBlitCommandThenNoneMiFlushDwIsProgramed) { + auto mockTagAllocator = std::make_unique>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + auto tag = mockTagAllocator->getTag(); + BlitProperties blitProperties{}; + blitProperties.copySize = {1, 1, 1}; + BlitPropertiesContainer blitPropertiesContainer1; + blitPropertiesContainer1.push_back(blitProperties); + blitPropertiesContainer1.push_back(blitProperties); + blitPropertiesContainer1.push_back(blitProperties); + + auto estimatedSizeWithoutNode = BlitCommandsHelper::estimateBlitCommandsSize( + blitPropertiesContainer1, false, true, false, pDevice->getRootDeviceEnvironment()); + blitProperties.multiRootDeviceEventSync = tag; + BlitPropertiesContainer blitPropertiesContainer2; + blitPropertiesContainer2.push_back(blitProperties); + blitPropertiesContainer2.push_back(blitProperties); + blitPropertiesContainer2.push_back(blitProperties); + auto estimatedSizeWithNode = BlitCommandsHelper::estimateBlitCommandsSize( + blitPropertiesContainer2, false, true, false, pDevice->getRootDeviceEnvironment()); + EXPECT_NE(estimatedSizeWithoutNode, estimatedSizeWithNode); +} \ No newline at end of file diff --git a/shared/test/unit_test/helpers/timestamp_packet_tests.cpp b/shared/test/unit_test/helpers/timestamp_packet_tests.cpp index 7ee5d975e3..8f040153f2 100644 --- a/shared/test/unit_test/helpers/timestamp_packet_tests.cpp +++ b/shared/test/unit_test/helpers/timestamp_packet_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2022-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -301,3 +301,35 @@ HWTEST_F(DeviceTimestampPacketTests, givenDebugFlagSetWhenCreatingTimestampPacke EXPECT_FALSE(tag->canBeReleased()); } + +using TimestampPacketHelperTests = Test; + +HWTEST_F(TimestampPacketHelperTests, givenTagNodesInMultiRootSyncContainerWhenProgramingDependensiecThenSemaforesAreProgrammed) { + StackVec buffer(4096); + LinearStream cmdStream(buffer.begin(), buffer.size()); + CsrDependencies deps; + auto mockTagAllocator = std::make_unique>(0, pDevice->getMemoryManager()); + TimestampPacketContainer container = {}; + container.add(mockTagAllocator->getTag()); + deps.multiRootTimeStampSyncContainer.push_back(&container); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(cmdStream, deps); + EXPECT_EQ(cmdStream.getUsed(), sizeof(typename FamilyType::MI_SEMAPHORE_WAIT)); +} + +HWTEST_F(TimestampPacketHelperTests, givenEmptyContainerMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) { + StackVec buffer(4096); + LinearStream cmdStream(buffer.begin(), buffer.size()); + CsrDependencies deps; + TimestampPacketContainer container = {}; + deps.multiRootTimeStampSyncContainer.push_back(&container); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(cmdStream, deps); + EXPECT_EQ(cmdStream.getUsed(), 0u); +} + +HWTEST_F(TimestampPacketHelperTests, givenEmptyMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) { + StackVec buffer(4096); + LinearStream cmdStream(buffer.begin(), buffer.size()); + CsrDependencies deps; + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(cmdStream, deps); + EXPECT_EQ(cmdStream.getUsed(), 0u); +} \ No newline at end of file