diff --git a/opencl/source/command_queue/CMakeLists.txt b/opencl/source/command_queue/CMakeLists.txt index 5e782f5c18..a597140777 100644 --- a/opencl/source/command_queue/CMakeLists.txt +++ b/opencl/source/command_queue/CMakeLists.txt @@ -13,6 +13,7 @@ set(RUNTIME_SRCS_COMMAND_QUEUE ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.h ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw_base.inl ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw_bdw_and_later.inl + ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_process_dispatch_for_kernels_instance.inl ${CMAKE_CURRENT_SOURCE_DIR}/copy_engine_state.h ${CMAKE_CURRENT_SOURCE_DIR}/cpu_data_transfer_handler.cpp ${CMAKE_CURRENT_SOURCE_DIR}/csr_selection_args.h diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 00a3205603..4e532c1a02 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -388,7 +388,8 @@ class CommandQueueHw : public CommandQueue { EventsRequest &eventsRequest, EventBuilder &externalEventBuilder, std::unique_ptr &&printfHandler, - CommandStreamReceiver *bcsCsr); + CommandStreamReceiver *bcsCsr, + TagNodeBase *multiRootDeviceSyncNode); CompletionStamp enqueueCommandWithoutKernel(Surface **surfaces, size_t surfaceCount, @@ -419,7 +420,7 @@ class CommandQueueHw : public CommandQueue { TimestampPacketDependencies &timestampPacketDependencies, const EventsRequest &eventsRequest, LinearStream *commandStream, - uint32_t commandType, bool queueBlocked); + uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync); void submitCacheFlush(Surface **surfaces, size_t numSurfaces, LinearStream *commandStream, @@ -470,7 +471,7 @@ class CommandQueueHw : public CommandQueue { blockedCommandsData = std::make_unique(commandStream, *gpgpuCsr.getInternalAllocationStorage()); } else { commandStream = &getCommandStream(*this, csrDependencies, profilingRequired, perfCountersRequired, - blitEnqueue, 
multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0); + blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0, eventsRequest.outEvent); } return commandStream; } diff --git a/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl b/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl index 33b2cadafa..c9dd2c5124 100644 --- a/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl +++ b/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl @@ -45,7 +45,7 @@ bool CommandQueueHw::isCacheFlushCommand(uint32_t commandType) const { } template <> -LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) { +LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) { size_t expectedSizeCS = 0; [[maybe_unused]] bool usePostSync = false; if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { diff --git a/opencl/source/command_queue/command_queue_process_dispatch_for_kernels_instance.inl b/opencl/source/command_queue/command_queue_process_dispatch_for_kernels_instance.inl new file mode 100644 index 0000000000..1d995e463e --- /dev/null +++ b/opencl/source/command_queue/command_queue_process_dispatch_for_kernels_instance.inl @@ -0,0 +1,16 @@ +/* + * Copyright (C) 2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +template void 
CommandQueueHw::processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo, + std::unique_ptr &printfHandler, + Event *event, + TagNodeBase *&hwTimeStamps, + bool blockQueue, + CsrDependencies &csrDeps, + KernelOperation *blockedCommandsData, + TimestampPacketDependencies &timestampPacketDependencies); \ No newline at end of file diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 320aac9f09..3ceb9ba3c0 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -176,7 +176,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, BlitPropertiesContainer blitPropertiesContainer; if (this->context->getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver); + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, computeCommandStreamReceiver); } const bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo); @@ -225,7 +225,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } if (this->context->getRootDeviceIndices().size() > 1) { - TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStream, csrDeps); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStream, csrDeps); } if (enqueueWithBlitAuxTranslation) { @@ -240,6 +240,13 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } bool flushDependenciesForNonKernelCommand = false; + TagNodeBase *multiRootEventSyncStamp = nullptr; + if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) { + multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode(); + if (!blockQueue) { + this->getGpgpuCommandStreamReceiver().makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation()); + } + } if 
(multiDispatchInfo.empty() == false) { processDispatchForKernels(multiDispatchInfo, printfHandler, eventBuilder.getEvent(), @@ -381,7 +388,8 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, eventsRequest, eventBuilder, std::move(printfHandler), - nullptr); + nullptr, + multiRootEventSyncStamp); } if (deferredTimestampPackets.get()) { @@ -474,6 +482,10 @@ void CommandQueueHw::processDispatchForKernels(const MultiDispatchInf dispatchWalkerArgs.commandType = commandType; dispatchWalkerArgs.event = event; + if (event && event->getMultiRootDeviceTimestampPacketNodes() && !event->getMultiRootDeviceTimestampPacketNodes()->peekNodes().empty()) { + dispatchWalkerArgs.multiRootDeviceEventStamp = event->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0u); + } + HardwareInterface::dispatchWalker( *this, multiDispatchInfo, @@ -496,7 +508,7 @@ BlitProperties CommandQueueHw::processDispatchForBlitEnqueue(CommandS const MultiDispatchInfo &multiDispatchInfo, TimestampPacketDependencies &timestampPacketDependencies, const EventsRequest &eventsRequest, LinearStream *commandStream, - uint32_t commandType, bool queueBlocked) { + uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync) { auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType); auto blitProperties = ClBlitProperties::constructProperties(blitDirection, blitCommandStreamReceiver, @@ -509,7 +521,7 @@ BlitProperties CommandQueueHw::processDispatchForBlitEnqueue(CommandS blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes); blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes); } - + blitProperties.multiRootDeviceEventSync = multiRootDeviceEventSync; auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0); blitProperties.outputTimestampPacket = currentTimestampPacketNode; @@ -614,6 +626,19 @@ void 
CommandQueueHw::processDispatchForMarker(CommandQueue &commandQu HardwareInterface::dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); HardwareInterface::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); getGpgpuCommandStreamReceiver().makeResident(*hwTimeStamps->getBaseGraphicsAllocation()); + if (event->getMultiRootDeviceTimestampPacketNodes() && !event->getMultiRootDeviceTimestampPacketNodes()->peekNodes().empty()) { + auto node = *(event->getMultiRootDeviceTimestampPacketNodes()->peekNodes().end() - 1); + const auto &hwInfo = commandQueue.getDevice().getHardwareInfo(); + NEO::PipeControlArgs args = {}; + args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); + MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + *commandStream, + PostSyncMode::ImmediateData, + node->getGpuAddress() + node->getContextEndOffset(), + std::numeric_limits::max(), + hwInfo, + args); + } } template @@ -634,6 +659,22 @@ void CommandQueueHw::processDispatchForMarkerWithTimestampPacket(Comm EncodeStoreMMIO::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextEndGpuAddress, false); EncodeStoreMMIO::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalEndAddress, false); + if (eventsRequest.outEvent != nullptr) { + auto event = castToObjectOrAbort(*eventsRequest.outEvent); + if (event->getMultiRootDeviceTimestampPacketNodes() && !event->getMultiRootDeviceTimestampPacketNodes()->peekNodes().empty()) { + auto node = *(event->getMultiRootDeviceTimestampPacketNodes()->peekNodes().end() - 1); + const auto &hwInfo = commandQueue.getDevice().getHardwareInfo(); + NEO::PipeControlArgs args = {}; + args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); + MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + *commandStream, + PostSyncMode::ImmediateData, + node->getGpuAddress() + 
node->getContextEndOffset(), + std::numeric_limits::max(), + hwInfo, + args); + } + } } template @@ -900,7 +941,8 @@ void CommandQueueHw::enqueueBlocked( EventsRequest &eventsRequest, EventBuilder &externalEventBuilder, std::unique_ptr &&printfHandler, - CommandStreamReceiver *bcsCsr) { + CommandStreamReceiver *bcsCsr, + TagNodeBase *multiRootDeviceSyncNode) { TakeOwnershipWrapper> queueOwnership(*this); @@ -971,7 +1013,8 @@ void CommandQueueHw::enqueueBlocked( std::move(printfHandler), preemptionMode, multiDispatchInfo.peekMainKernel(), - (uint32_t)multiDispatchInfo.size()); + (uint32_t)multiDispatchInfo.size(), + multiRootDeviceSyncNode); } if (storeTimestampPackets) { command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies)); @@ -1273,10 +1316,14 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp } TimestampPacketDependencies timestampPacketDependencies; + TagNodeBase *multiRootEventSyncStamp = nullptr; BlitPropertiesContainer blitPropertiesContainer; CsrDependencies csrDeps; eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All); + if (this->context->getRootDeviceIndices().size() > 1) { + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, bcsCsr); + } auto allocator = bcsCsr.getTimestampPacketAllocator(); if (!blockQueue) { @@ -1299,6 +1346,10 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp if (eventBuilder.getEvent()) { eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer); } + if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) { + multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode(); + this->getGpgpuCommandStreamReceiver().makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation()); + } CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0}; @@ -1315,7 +1366,7 @@ cl_int 
CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp } blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, gpgpuCommandStream, cmdType, blockQueue)); + eventsRequest, gpgpuCommandStream, cmdType, blockQueue, multiRootEventSyncStamp)); if (!blockQueue) { completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking, @@ -1342,7 +1393,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp updateFromCompletionStamp(completionStamp, eventBuilder.getEvent()); if (blockQueue) { - enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr); + enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp); if (gpgpuSubmission) { if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) { diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h index f577631992..8e7dc560a6 100644 --- a/opencl/source/command_queue/gpgpu_walker.h +++ b/opencl/source/command_queue/gpgpu_walker.h @@ -89,7 +89,7 @@ class GpgpuWalkerHelper { template struct EnqueueOperation { using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList); + static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo 
&multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent); static size_t getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo); static size_t getSizeRequiredForTimestampPacketWrite(); static size_t getSizeForCacheFlushAfterWalkerCommands(const Kernel &kernel, const CommandQueue &commandQueue); @@ -102,8 +102,8 @@ struct EnqueueOperation { template LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, - Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) { - size_t expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList); + Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) { + size_t expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList, outEvent); return commandQueue.getCS(expectedSizeCS); } diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl index b1dda219dd..98af358a86 100644 --- a/opencl/source/command_queue/gpgpu_walker_base.inl +++ b/opencl/source/command_queue/gpgpu_walker_base.inl @@ -165,7 +165,7 @@ size_t GpgpuWalkerHelper::getSizeForWaDisableRccRhwoOptimization(cons } template -size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, 
CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist) { +size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist, cl_event *outEvent) { size_t expectedSizeCS = 0; auto &hwInfo = commandQueue.getDevice().getHardwareInfo(); auto &gfxCoreHelper = commandQueue.getDevice().getGfxCoreHelper(); @@ -216,8 +216,14 @@ size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, c if (DebugManager.flags.GpuScratchRegWriteAfterWalker.get() != -1) { expectedSizeCS += sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM); } - - expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps); + expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps); + if (outEvent) { + auto pEvent = castToObjectOrAbort(*outEvent); + if ((pEvent->getContext()->getRootDeviceIndices().size() > 1) && (!pEvent->isUserEvent())) { + expectedSizeCS += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); + } + } + expectedSizeCS += MemorySynchronizationCommands::getSizeForSingleBarrier(false); return expectedSizeCS; } diff --git a/opencl/source/command_queue/hardware_interface.h b/opencl/source/command_queue/hardware_interface.h index a88e1c6d4f..81b2d90dff 100644 --- a/opencl/source/command_queue/hardware_interface.h +++ b/opencl/source/command_queue/hardware_interface.h @@ -31,6 +31,7 @@ struct HardwareInterfaceWalkerArgs { size_t localWorkSizes[3] = {}; TagNodeBase *hwTimeStamps = nullptr; TagNodeBase *hwPerfCounter = nullptr; + TagNodeBase *multiRootDeviceEventStamp = nullptr; TimestampPacketDependencies *timestampPacketDependencies = nullptr; TimestampPacketContainer 
*currentTimestampPacketNodes = nullptr; const Vec3 *numberOfWorkgroups = nullptr; diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index a514157406..a50bf9a709 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -133,6 +133,7 @@ void HardwareInterface::dispatchWalker( walkerArgs.currentTimestampPacketNodes); walkerArgs.currentDispatchIndex = 0; + for (auto &dispatchInfo : multiDispatchInfo) { dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo()); walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel); @@ -143,6 +144,19 @@ void HardwareInterface::dispatchWalker( dispatchInfo.dispatchEpilogueCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo()); } + if (walkerArgs.multiRootDeviceEventStamp != nullptr) { + const auto &hwInfo = commandQueue.getDevice().getHardwareInfo(); + PipeControlArgs args; + args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); + MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + *commandStream, + PostSyncMode::ImmediateData, + walkerArgs.multiRootDeviceEventStamp->getGpuAddress() + walkerArgs.multiRootDeviceEventStamp->getContextEndOffset(), + std::numeric_limits::max(), + hwInfo, + args); + } + if (mainKernel->requiresCacheFlushCommand(commandQueue)) { uint64_t postSyncAddress = 0; if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { diff --git a/opencl/source/context/context.cpp b/opencl/source/context/context.cpp index 3ec69a2036..f4a5328994 100644 --- a/opencl/source/context/context.cpp +++ b/opencl/source/context/context.cpp @@ -45,7 +45,9 @@ Context::Context( Context::~Context() { gtpinNotifyContextDestroy((cl_context)this); - + if 
(multiRootDeviceTimestampPacketAllocator.get() != nullptr) { + multiRootDeviceTimestampPacketAllocator.reset(); + } if (smallBufferPoolAllocator.isAggregatedSmallBuffersEnabled(this)) { smallBufferPoolAllocator.releaseSmallBufferPool(); } @@ -555,5 +557,15 @@ void Context::BufferPoolAllocator::releaseSmallBufferPool() { delete this->mainStorage; this->mainStorage = nullptr; } +TagAllocatorBase *Context::getMultiRootDeviceTimestampPacketAllocator() { + return multiRootDeviceTimestampPacketAllocator.get(); +} +void Context::setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr &allocator) { + multiRootDeviceTimestampPacketAllocator = std::move(allocator); +} + +std::unique_lock Context::obtainOwnershipForMultiRootDeviceAllocator() { + return std::unique_lock(multiRootDeviceAllocatorMtx); +} } // namespace NEO diff --git a/opencl/source/context/context.h b/opencl/source/context/context.h index 54d8cfcdc6..d6dd69d167 100644 --- a/opencl/source/context/context.h +++ b/opencl/source/context/context.h @@ -34,6 +34,7 @@ class SharingFunctions; class SVMAllocsManager; class Program; class Platform; +class TagAllocatorBase; template <> struct OpenCLObjectMapper<_cl_context> { @@ -220,6 +221,9 @@ class Context : public BaseObject<_cl_context> { BufferPoolAllocator &getBufferPoolAllocator() { return this->smallBufferPoolAllocator; } + TagAllocatorBase *getMultiRootDeviceTimestampPacketAllocator(); + std::unique_lock obtainOwnershipForMultiRootDeviceAllocator(); + void setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr &allocator); protected: struct BuiltInKernel { @@ -260,6 +264,8 @@ class Context : public BaseObject<_cl_context> { uint32_t maxRootDeviceIndex = std::numeric_limits::max(); cl_bool preferD3dSharedResources = 0u; ContextType contextType = ContextType::CONTEXT_TYPE_DEFAULT; + std::unique_ptr multiRootDeviceTimestampPacketAllocator; + std::mutex multiRootDeviceAllocatorMtx; bool interopUserSync = false; bool resolvesRequiredInKernels = false; diff 
--git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index 05e4d6fef0..6a82308e9e 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -129,6 +129,9 @@ Event::~Event() { if (timeStampNode != nullptr) { timeStampNode->returnTag(); } + if (multiRootTimeStampSyncNode != nullptr) { + multiRootTimeStampSyncNode->returnTag(); + } if (perfCounterNode != nullptr) { cmdQueue->getPerfCounters()->deleteQuery(perfCounterNode->getQueryHandleRef()); perfCounterNode->getQueryHandleRef() = {}; @@ -875,7 +878,6 @@ TagNodeBase *Event::getHwTimeStampNode() { } TagNodeBase *Event::getHwPerfCounterNode() { - if (!perfCounterNode && cmdQueue->getPerfCounters()) { const uint32_t gpuReportSize = HwPerfCounter::getSize(*(cmdQueue->getPerfCounters())); perfCounterNode = cmdQueue->getGpgpuCommandStreamReceiver().getEventPerfCountAllocator(gpuReportSize)->getTag(); @@ -883,11 +885,27 @@ TagNodeBase *Event::getHwPerfCounterNode() { return perfCounterNode; } +TagNodeBase *Event::getMultiRootTimestampSyncNode() { + auto lock = getContext()->obtainOwnershipForMultiRootDeviceAllocator(); + if (getContext()->getMultiRootDeviceTimestampPacketAllocator() == nullptr) { + auto allocator = cmdQueue->getGpgpuCommandStreamReceiver().createMultiRootDeviceTimestampPacketAllocator(getContext()->getRootDeviceIndices()); + getContext()->setMultiRootDeviceTimestampPacketAllocator(allocator); + } + lock.unlock(); + if (multiRootDeviceTimestampPacketContainer.get() == nullptr) { + multiRootDeviceTimestampPacketContainer = std::make_unique(); + } + multiRootTimeStampSyncNode = getContext()->getMultiRootDeviceTimestampPacketAllocator()->getTag(); + multiRootDeviceTimestampPacketContainer->add(multiRootTimeStampSyncNode); + return multiRootTimeStampSyncNode; +} + void Event::addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer) { timestampPacketContainer->assignAndIncrementNodesRefCounts(inputTimestampPacketContainer); } 
TimestampPacketContainer *Event::getTimestampPacketNodes() const { return timestampPacketContainer.get(); } +TimestampPacketContainer *Event::getMultiRootDeviceTimestampPacketNodes() const { return multiRootDeviceTimestampPacketContainer.get(); } bool Event::checkUserEventDependencies(cl_uint numEventsInWaitList, const cl_event *eventWaitList) { bool userEventsDependencies = false; diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index 7468754286..688f2ed516 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -114,6 +114,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { void addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer); TimestampPacketContainer *getTimestampPacketNodes() const; + TimestampPacketContainer *getMultiRootDeviceTimestampPacketNodes() const; bool isPerfCountersEnabled() const { return perfCountersEnabled; @@ -128,6 +129,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { } TagNodeBase *getHwPerfCounterNode(); + TagNodeBase *getMultiRootTimestampSyncNode(); std::unique_ptr flushStamp; std::atomic taskLevel; @@ -386,7 +388,9 @@ class Event : public BaseObject<_cl_event>, public IDNode { bool perfCountersEnabled; TagNodeBase *timeStampNode = nullptr; TagNodeBase *perfCounterNode = nullptr; + TagNodeBase *multiRootTimeStampSyncNode = nullptr; std::unique_ptr timestampPacketContainer; + std::unique_ptr multiRootDeviceTimestampPacketContainer; //number of events this event depends on std::atomic parentCount; //event parents diff --git a/opencl/source/gen11/command_queue_gen11.cpp b/opencl/source/gen11/command_queue_gen11.cpp index c1efcf0cea..2a6e012be0 100644 --- a/opencl/source/gen11/command_queue_gen11.cpp +++ b/opencl/source/gen11/command_queue_gen11.cpp @@ -16,8 +16,8 @@ namespace NEO { typedef Gen11Family Family; +#include "opencl/source/command_queue/command_queue_process_dispatch_for_kernels_instance.inl" static auto gfxCore = 
IGFX_GEN11_CORE; - template class CommandQueueHw; template <> diff --git a/opencl/source/gen12lp/command_queue_gen12lp.cpp b/opencl/source/gen12lp/command_queue_gen12lp.cpp index 7ad2d1f9e9..67cfa54a4e 100644 --- a/opencl/source/gen12lp/command_queue_gen12lp.cpp +++ b/opencl/source/gen12lp/command_queue_gen12lp.cpp @@ -14,16 +14,13 @@ #include "command_queue_helpers_gen12lp.inl" namespace NEO { - typedef Gen12LpFamily Family; +#include "opencl/source/command_queue/command_queue_process_dispatch_for_kernels_instance.inl" static auto gfxCore = IGFX_GEN12LP_CORE; - template <> void populateFactoryTable>() { extern CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE]; commandQueueFactory[gfxCore] = CommandQueueHw::create; } - template class CommandQueueHw; - } // namespace NEO diff --git a/opencl/source/gen8/command_queue_gen8.cpp b/opencl/source/gen8/command_queue_gen8.cpp index dc3fcae08e..29a5613ae8 100644 --- a/opencl/source/gen8/command_queue_gen8.cpp +++ b/opencl/source/gen8/command_queue_gen8.cpp @@ -16,8 +16,8 @@ namespace NEO { typedef Gen8Family Family; +#include "opencl/source/command_queue/command_queue_process_dispatch_for_kernels_instance.inl" static auto gfxCore = IGFX_GEN8_CORE; - template class CommandQueueHw; template <> diff --git a/opencl/source/gen9/command_queue_gen9.cpp b/opencl/source/gen9/command_queue_gen9.cpp index 8dddd508a9..2c3824fad7 100644 --- a/opencl/source/gen9/command_queue_gen9.cpp +++ b/opencl/source/gen9/command_queue_gen9.cpp @@ -16,8 +16,10 @@ namespace NEO { typedef Gen9Family Family; +#include "opencl/source/command_queue/command_queue_process_dispatch_for_kernels_instance.inl" +} // namespace NEO +namespace NEO { static auto gfxCore = IGFX_GEN9_CORE; - template class CommandQueueHw; template <> diff --git a/opencl/source/helpers/properties_helper.cpp b/opencl/source/helpers/properties_helper.cpp index eecd1b954a..6cd42d1aec 100644 --- a/opencl/source/helpers/properties_helper.cpp +++ 
b/opencl/source/helpers/properties_helper.cpp @@ -20,7 +20,6 @@ namespace NEO { void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) { auto csrOwnership = dependentCsr.obtainUniqueOwnership(); dependentCsr.updateTagFromWait(); - csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast(dependentCsr.getTagAddress())}); } void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const { @@ -54,6 +53,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci if (productHelper.isDcFlushAllowed()) { if (!dependentCsr.isLatestTaskCountFlushed()) { flushDependentCsr(dependentCsr, csrDeps); + //csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast(dependentCsr.getTagAddress())}); currentCsr.makeResident(*dependentCsr.getTagAllocation()); } } @@ -62,23 +62,22 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci } } -void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const { +void EventsRequest::fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const { for (cl_uint i = 0; i < this->numEventsInWaitList; i++) { auto event = castToObjectOrAbort(this->eventWaitList[i]); if (event->isUserEvent() || CompletionStamp::notReady == event->peekTaskCount()) { continue; } - if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) { + auto timestampPacketContainer = event->getMultiRootDeviceTimestampPacketNodes(); + if (!timestampPacketContainer || timestampPacketContainer->peekNodes().empty()) { + continue; + } auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver(); if (!dependentCsr.isLatestTaskCountFlushed()) { 
flushDependentCsr(dependentCsr, csrDeps); - } else { - csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast(dependentCsr.getTagAddress())}); } - - auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex()); - currentCsr.getResidencyAllocations().push_back(graphicsAllocation); + csrDeps.multiRootTimeStampSyncContainer.push_back(timestampPacketContainer); } } } diff --git a/opencl/source/helpers/properties_helper.h b/opencl/source/helpers/properties_helper.h index ed9b60f423..0448176dc9 100644 --- a/opencl/source/helpers/properties_helper.h +++ b/opencl/source/helpers/properties_helper.h @@ -25,7 +25,7 @@ struct EventsRequest { : numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), outEvent(outEvent) {} void fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const; - void fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const; + void fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const; void setupBcsCsrForOutputEvent(CommandStreamReceiver &bcsCsr) const; cl_uint numEventsInWaitList; diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 85da42fb82..5cf945a4cc 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -117,10 +117,11 @@ CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminate CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector surfaces, bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr &&printfHandler, - PreemptionMode preemptionMode, Kernel *kernel, 
uint32_t kernelCount, + TagNodeBase *multiRootDeviceSyncNode) : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM), commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel), - kernelCount(kernelCount), preemptionMode(preemptionMode) { + kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) { UNRECOVERABLE_IF(nullptr == this->kernel); kernel->incRefInternal(); } @@ -162,6 +163,9 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term printfHandler->makeResident(commandStreamReceiver); } makeTimestampPacketsResident(commandStreamReceiver); + if (multiRootDeviceSyncNode != nullptr) { + commandStreamReceiver.makeResident(*multiRootDeviceSyncNode->getBaseGraphicsAllocation()); + } if (kernelOperation->blitPropertiesContainer.size() > 0) { CsrDependencies csrDeps; @@ -213,7 +217,7 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term false); // hasRelaxedOrderingDependencies if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); + eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); } const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); @@ -306,7 +310,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() { blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0]; if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(blitProperties.csrDependencies, *bcsCsr); + eventsRequest.fillCsrDependenciesForRootDevices(blitProperties.csrDependencies, *bcsCsr); } const auto newTaskCount = bcsCsr->flushBcsTask(kernelOperation->blitPropertiesContainer, false, 
commandQueue.isProfilingEnabled(), commandQueue.getDevice()); @@ -388,7 +392,7 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term false); // hasRelaxedOrderingDependencies if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { - eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); + eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); } const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); diff --git a/opencl/source/helpers/task_information.h b/opencl/source/helpers/task_information.h index decbf2e112..54cc137710 100644 --- a/opencl/source/helpers/task_information.h +++ b/opencl/source/helpers/task_information.h @@ -131,7 +131,7 @@ class CommandComputeKernel : public Command { public: CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector surfaces, bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr &&printfHandler, - PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount); + PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, TagNodeBase *multiRootDeviceSyncNode); ~CommandComputeKernel() override; @@ -150,6 +150,7 @@ class CommandComputeKernel : public Command { Kernel *kernel; uint32_t kernelCount; PreemptionMode preemptionMode; + TagNodeBase *multiRootDeviceSyncNode; }; class CommandWithoutKernel : public Command { diff --git a/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp b/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp index 7c502707e5..b89f8cff97 100644 --- a/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp +++ b/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp @@ -26,7 +26,6 @@ void populateFactoryTable>() { extern CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE]; commandQueueFactory[gfxCore] = CommandQueueHw::create; } - } // 
namespace NEO template class NEO::CommandQueueHw; diff --git a/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp b/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp index 56cc5c5a63..2e8499d09a 100644 --- a/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp +++ b/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp @@ -15,6 +15,7 @@ namespace NEO { using Family = XeHpcCoreFamily; +#include "opencl/source/command_queue/command_queue_process_dispatch_for_kernels_instance.inl" static auto gfxCore = IGFX_XE_HPC_CORE; } // namespace NEO diff --git a/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp b/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp index 3f0647ce99..bf5f5b94a4 100644 --- a/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp +++ b/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp @@ -21,6 +21,7 @@ static auto gfxCore = IGFX_XE_HPG_CORE; #include "opencl/source/command_queue/command_queue_hw_xehp_and_later.inl" namespace NEO { +#include "opencl/source/command_queue/command_queue_process_dispatch_for_kernels_instance.inl" template <> void populateFactoryTable>() { extern CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE]; diff --git a/opencl/test/black_box_test/hello_world_opencl.cpp b/opencl/test/black_box_test/hello_world_opencl.cpp index d63f1749dc..747d6f84ca 100644 --- a/opencl/test/black_box_test/hello_world_opencl.cpp +++ b/opencl/test/black_box_test/hello_world_opencl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp index 782b71dfc9..73a9778faa 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp @@ -19,6 +19,7 @@ #include 
"opencl/test/unit_test/command_queue/command_queue_fixture.h" #include "opencl/test/unit_test/fixtures/buffer_fixture.h" #include "opencl/test/unit_test/fixtures/image_fixture.h" +#include "opencl/test/unit_test/helpers/cl_hw_parse.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" @@ -193,6 +194,7 @@ class MockCommandStreamReceiverWithFailingFlushBatchedSubmission : public MockCo template struct MockCommandQueueHwWithOverwrittenCsr : public CommandQueueHw { using CommandQueueHw::CommandQueueHw; + using CommandQueueHw::timestampPacketContainer; MockCommandStreamReceiverWithFailingFlushBatchedSubmission *csr; CommandStreamReceiver &getGpgpuCommandStreamReceiver() const override { return *csr; } }; @@ -218,6 +220,243 @@ HWTEST_F(CommandQueueHwTest, GivenCommandQueueWhenProcessDispatchForMarkerCalled EXPECT_GT(csr.makeResidentCalledTimes, 0u); } +HWTEST_F(CommandQueueHwTest, GivenEventWithRootDeviceSyncNodesWhenProcessDispatchForMarkerCalledThenEndNodeWillBeSignaledByPipeControl) { + + pDevice->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; + MockCommandStreamReceiverWithFailingFlushBatchedSubmission csr(*pDevice->getExecutionEnvironment(), 0, pDevice->getDeviceBitfield()); + auto mockTagAllocator = std::make_unique>(pCmdQ->getContextPtr()->getRootDeviceIndices(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + std::unique_ptr uniquePtr(mockTagAllocator.release()); + pCmdQ->getContextPtr()->setMultiRootDeviceTimestampPacketAllocator(uniquePtr); + auto myCmdQ = std::make_unique>(pCmdQ->getContextPtr(), pClDevice, nullptr, false); + myCmdQ->csr = &csr; + csr.osContext = &pCmdQ->getGpgpuCommandStreamReceiver().getOsContext(); + std::unique_ptr event(new Event(myCmdQ.get(), CL_COMMAND_COPY_BUFFER, 0, 0)); + event->getMultiRootTimestampSyncNode(); + event->getMultiRootTimestampSyncNode(); + auto lastNode = 
event->getMultiRootTimestampSyncNode(); + ASSERT_NE(nullptr, event); + + cl_event clEvent = event.get(); + EventsRequest eventsRequest(0, nullptr, &clEvent); + uint32_t streamBuffer[100] = {}; + NEO::LinearStream linearStream(streamBuffer, sizeof(streamBuffer)); + CsrDependencies deps = {}; + myCmdQ->processDispatchForMarker(*myCmdQ.get(), &linearStream, eventsRequest, deps); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(linearStream.getCpuBase(), 0), linearStream.getUsed())); + auto itor = find(cmdList.begin(), cmdList.end()); + EXPECT_NE(cmdList.end(), itor); + bool expectedAddressInPipeControl = false; + while (itor != cmdList.end()) { + auto pipeControlCmd = reinterpret_cast(*itor); + uint64_t addressHigh = pipeControlCmd->getAddressHigh(); + uint64_t addressLow = pipeControlCmd->getAddress(); + addressHigh = addressHigh << 32; + uint64_t address = addressHigh | addressLow; + if (address == lastNode->getGpuAddress() + lastNode->getContextEndOffset()) { + expectedAddressInPipeControl = true; + break; + } + itor = find(++itor, cmdList.end()); + } + EXPECT_TRUE(expectedAddressInPipeControl); +} + +HWTEST_F(CommandQueueHwTest, GivenEventWithEmptyRootDeviceSyncNodesContainerWhenProcessDispatchForMarkerCalledThenSyncPipeControlIsNotProgrammed) { + + pDevice->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; + MockCommandStreamReceiverWithFailingFlushBatchedSubmission csr(*pDevice->getExecutionEnvironment(), 0, pDevice->getDeviceBitfield()); + auto mockTagAllocator = std::make_unique>(pCmdQ->getContextPtr()->getRootDeviceIndices(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + std::unique_ptr uniquePtr(mockTagAllocator.release()); + pCmdQ->getContextPtr()->setMultiRootDeviceTimestampPacketAllocator(uniquePtr); + auto myCmdQ = std::make_unique>(pCmdQ->getContextPtr(), pClDevice, nullptr, false); + myCmdQ->csr = &csr; + csr.osContext = 
&pCmdQ->getGpgpuCommandStreamReceiver().getOsContext(); + std::unique_ptr> event(new MockEvent(myCmdQ.get(), CL_COMMAND_COPY_BUFFER, 0, 0)); + auto node1 = event->getMultiRootTimestampSyncNode(); + auto node2 = event->getMultiRootTimestampSyncNode(); + auto node3 = event->getMultiRootTimestampSyncNode(); + event->multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + ASSERT_NE(nullptr, event); + + cl_event clEvent = event.get(); + EventsRequest eventsRequest(0, nullptr, &clEvent); + uint32_t streamBuffer[100] = {}; + NEO::LinearStream linearStream(streamBuffer, sizeof(streamBuffer)); + CsrDependencies deps = {}; + myCmdQ->processDispatchForMarker(*myCmdQ.get(), &linearStream, eventsRequest, deps); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(linearStream.getCpuBase(), 0), linearStream.getUsed())); + auto itor = find(cmdList.begin(), cmdList.end()); + bool notExpectedAddressInPipeControl = true; + while (itor != cmdList.end()) { + auto pipeControlCmd = reinterpret_cast(*itor); + uint64_t addressHigh = pipeControlCmd->getAddressHigh(); + uint64_t addressLow = pipeControlCmd->getAddress(); + addressHigh = addressHigh << 32; + uint64_t address = addressHigh | addressLow; + if (address == node1->getGpuAddress() + node1->getContextEndOffset() || + address == node2->getGpuAddress() + node2->getContextEndOffset() || + address == node3->getGpuAddress() + node3->getContextEndOffset()) { + notExpectedAddressInPipeControl = false; + break; + } + itor = find(++itor, cmdList.end()); + } + EXPECT_TRUE(notExpectedAddressInPipeControl); +} + +HWTEST_F(CommandQueueHwTest, GivenEventWithRootDeviceSyncNodesWhenProcessDispatchForMarkerWithTimestampPacketCalledThenEndNodeWillBeSignaledByPipeControl) { + + pDevice->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; + MockCommandStreamReceiverWithFailingFlushBatchedSubmission csr(*pDevice->getExecutionEnvironment(), 0, 
pDevice->getDeviceBitfield()); + auto mockTagAllocator = std::make_unique>(pCmdQ->getContextPtr()->getRootDeviceIndices(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + std::unique_ptr uniquePtr(mockTagAllocator.release()); + pCmdQ->getContextPtr()->setMultiRootDeviceTimestampPacketAllocator(uniquePtr); + auto myCmdQ = std::make_unique>(pCmdQ->getContextPtr(), pClDevice, nullptr, false); + myCmdQ->csr = &csr; + + myCmdQ->timestampPacketContainer = std::make_unique(); + myCmdQ->timestampPacketContainer->add(pCmdQ->getContextPtr()->getMultiRootDeviceTimestampPacketAllocator()->getTag()); + + csr.osContext = &pCmdQ->getGpgpuCommandStreamReceiver().getOsContext(); + std::unique_ptr event(new Event(myCmdQ.get(), CL_COMMAND_COPY_BUFFER, 0, 0)); + event->getMultiRootTimestampSyncNode(); + event->getMultiRootTimestampSyncNode(); + auto lastNode = event->getMultiRootTimestampSyncNode(); + ASSERT_NE(nullptr, event); + + cl_event clEvent = event.get(); + EventsRequest eventsRequest(0, nullptr, &clEvent); + uint32_t streamBuffer[100] = {}; + NEO::LinearStream linearStream(streamBuffer, sizeof(streamBuffer)); + CsrDependencies deps = {}; + myCmdQ->processDispatchForMarkerWithTimestampPacket(*myCmdQ.get(), &linearStream, eventsRequest, deps); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(linearStream.getCpuBase(), 0), linearStream.getUsed())); + auto itor = find(cmdList.begin(), cmdList.end()); + EXPECT_NE(cmdList.end(), itor); + + bool expectedAddressInPipeControl = false; + while (itor != cmdList.end()) { + auto pipeControlCmd = reinterpret_cast(*itor); + uint64_t addressHigh = pipeControlCmd->getAddressHigh(); + uint64_t addressLow = pipeControlCmd->getAddress(); + addressHigh = addressHigh << 32; + uint64_t address = addressHigh | addressLow; + if (address == lastNode->getGpuAddress() + lastNode->getContextEndOffset()) { + expectedAddressInPipeControl = true; + break; + } + itor = find(++itor, 
cmdList.end()); + } + EXPECT_TRUE(expectedAddressInPipeControl); +} + +HWTEST_F(CommandQueueHwTest, GivenEventWithEmptyRootDeviceSyncNodesContainerWhenProcessDispatchForMarkerWithTimestampPacketCalledThenSyncPipeControlIsNotProgrammed) { + + pDevice->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; + MockCommandStreamReceiverWithFailingFlushBatchedSubmission csr(*pDevice->getExecutionEnvironment(), 0, pDevice->getDeviceBitfield()); + auto mockTagAllocator = std::make_unique>(pCmdQ->getContextPtr()->getRootDeviceIndices(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + std::unique_ptr uniquePtr(mockTagAllocator.release()); + pCmdQ->getContextPtr()->setMultiRootDeviceTimestampPacketAllocator(uniquePtr); + auto myCmdQ = std::make_unique>(pCmdQ->getContextPtr(), pClDevice, nullptr, false); + myCmdQ->csr = &csr; + + myCmdQ->timestampPacketContainer = std::make_unique(); + myCmdQ->timestampPacketContainer->add(pCmdQ->getContextPtr()->getMultiRootDeviceTimestampPacketAllocator()->getTag()); + + csr.osContext = &pCmdQ->getGpgpuCommandStreamReceiver().getOsContext(); + std::unique_ptr> event(new MockEvent(myCmdQ.get(), CL_COMMAND_COPY_BUFFER, 0, 0)); + auto node1 = event->getMultiRootTimestampSyncNode(); + auto node2 = event->getMultiRootTimestampSyncNode(); + auto node3 = event->getMultiRootTimestampSyncNode(); + event->multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + ASSERT_NE(nullptr, event); + + cl_event clEvent = event.get(); + EventsRequest eventsRequest(0, nullptr, &clEvent); + uint32_t streamBuffer[100] = {}; + NEO::LinearStream linearStream(streamBuffer, sizeof(streamBuffer)); + CsrDependencies deps = {}; + myCmdQ->processDispatchForMarkerWithTimestampPacket(*myCmdQ.get(), &linearStream, eventsRequest, deps); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(linearStream.getCpuBase(), 0), linearStream.getUsed())); + auto itor = find(cmdList.begin(), 
cmdList.end()); + bool notExpectedAddressInPipeControl = true; + while (itor != cmdList.end()) { + auto pipeControlCmd = reinterpret_cast(*itor); + uint64_t addressHigh = pipeControlCmd->getAddressHigh(); + uint64_t addressLow = pipeControlCmd->getAddress(); + addressHigh = addressHigh << 32; + uint64_t address = addressHigh | addressLow; + if (address == node1->getGpuAddress() + node1->getContextEndOffset() || + address == node2->getGpuAddress() + node2->getContextEndOffset() || + address == node3->getGpuAddress() + node3->getContextEndOffset()) { + notExpectedAddressInPipeControl = false; + break; + } + itor = find(++itor, cmdList.end()); + } + EXPECT_TRUE(notExpectedAddressInPipeControl); +} + +HWTEST_F(CommandQueueHwTest, GivenEventRequestWithoutOutEventWhenProcessDispatchForMarkerWithTimestampPacketCalledThenSyncPipeControlIsNotProgrammed) { + + pDevice->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; + MockCommandStreamReceiverWithFailingFlushBatchedSubmission csr(*pDevice->getExecutionEnvironment(), 0, pDevice->getDeviceBitfield()); + auto mockTagAllocator = std::make_unique>(pCmdQ->getContextPtr()->getRootDeviceIndices(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + std::unique_ptr uniquePtr(mockTagAllocator.release()); + pCmdQ->getContextPtr()->setMultiRootDeviceTimestampPacketAllocator(uniquePtr); + auto myCmdQ = std::make_unique>(pCmdQ->getContextPtr(), pClDevice, nullptr, false); + myCmdQ->csr = &csr; + + myCmdQ->timestampPacketContainer = std::make_unique(); + myCmdQ->timestampPacketContainer->add(pCmdQ->getContextPtr()->getMultiRootDeviceTimestampPacketAllocator()->getTag()); + + csr.osContext = &pCmdQ->getGpgpuCommandStreamReceiver().getOsContext(); + std::unique_ptr> event(new MockEvent(myCmdQ.get(), CL_COMMAND_COPY_BUFFER, 0, 0)); + auto node1 = event->getMultiRootTimestampSyncNode(); + auto node2 = event->getMultiRootTimestampSyncNode(); + auto node3 = event->getMultiRootTimestampSyncNode(); + 
ASSERT_NE(nullptr, event); + + EventsRequest eventsRequest(0, nullptr, nullptr); + uint32_t streamBuffer[100] = {}; + NEO::LinearStream linearStream(streamBuffer, sizeof(streamBuffer)); + CsrDependencies deps = {}; + myCmdQ->processDispatchForMarkerWithTimestampPacket(*myCmdQ.get(), &linearStream, eventsRequest, deps); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(linearStream.getCpuBase(), 0), linearStream.getUsed())); + auto itor = find(cmdList.begin(), cmdList.end()); + bool notExpectedAddressInPipeControl = true; + while (itor != cmdList.end()) { + auto pipeControlCmd = reinterpret_cast(*itor); + uint64_t addressHigh = pipeControlCmd->getAddressHigh(); + uint64_t addressLow = pipeControlCmd->getAddress(); + addressHigh = addressHigh << 32; + uint64_t address = addressHigh | addressLow; + if (address == node1->getGpuAddress() + node1->getContextEndOffset() || + address == node2->getGpuAddress() + node2->getContextEndOffset() || + address == node3->getGpuAddress() + node3->getContextEndOffset()) { + notExpectedAddressInPipeControl = false; + break; + } + itor = find(++itor, cmdList.end()); + } + EXPECT_TRUE(notExpectedAddressInPipeControl); +} + HWTEST_F(CommandQueueHwTest, GivenCommandQueueWhenItIsCreatedThenInitDirectSubmissionIsCalledOnAllBcsEngines) { MockCommandQueueHw queue(pContext, pClDevice, nullptr); for (auto engine : queue.bcsEngines) { diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp index 61820e0fdd..ecdb73218b 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp @@ -20,6 +20,7 @@ #include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" +#include "opencl/test/unit_test/mocks/mock_mdi.h" using namespace 
NEO; @@ -276,7 +277,7 @@ HWTEST_F(OOQueueHwTest, givenBlockedOutOfOrderCmdQueueAndAsynchronouslyCompleted cmdQHw->taskLevel = 23; cmdQHw->enqueueKernel(mockKernel, 1, &offset, &size, &size, 1, &blockedEvent, nullptr); - //new virtual event is created on enqueue, bind it to the created virtual event + // new virtual event is created on enqueue, bind it to the created virtual event EXPECT_NE(cmdQHw->virtualEvent, &virtualEvent); event.setStatus(CL_SUBMITTED); @@ -285,7 +286,7 @@ HWTEST_F(OOQueueHwTest, givenBlockedOutOfOrderCmdQueueAndAsynchronouslyCompleted EXPECT_FALSE(cmdQHw->isQueueBlocked()); //+1 due to dependency between virtual event & new virtual event - //new virtual event is actually responsible for command delivery + // new virtual event is actually responsible for command delivery EXPECT_EQ(virtualEventTaskLevel + 1, cmdQHw->taskLevel); EXPECT_EQ(virtualEventTaskLevel + 1, mockCSR->lastTaskLevelToFlushTask); } @@ -970,3 +971,90 @@ HWTEST_F(CommandQueueHwTest, GivenBuiltinKernelWhenBuiltinDispatchInfoBuilderIsP EXPECT_EQ(builder.paramsToUse.offset.x, dispatchInfo->getOffset().x); EXPECT_EQ(builder.paramsToUse.kernel, dispatchInfo->getKernel()); } +HWTEST_F(CommandQueueHwTest, GivenMultiRootDeviceSyncEventWhenProcessDispatchForKernelsThenSyncNodeSignaledByPipeControll) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + MockDefaultContext context{true}; + std::unique_ptr pCmdQ1(createCommandQueue(context.getDevice(0), nullptr, &context)); + CommandQueueHw *cmdQHw = static_cast *>(pCmdQ1.get()); + MockKernelWithInternals mockKernelWithInternals(*context.getDevice(0), &context); + + MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector({mockKernelWithInternals.mockKernel})); + std::unique_ptr printfHandler; + std::unique_ptr hwTimeStamps = std::make_unique>>(); + TagNodeBase *hwTimeStampsPtr = hwTimeStamps.get(); + bool blockQueue = false; + CsrDependencies csrDeps = {}; + KernelOperation *blockedCommandsData = nullptr; + 
TimestampPacketDependencies timestampPacketDependencies = {}; + std::unique_ptr> event(new MockEvent(cmdQHw, CL_COMMAND_COPY_BUFFER, 0, 0)); + auto node = event->getMultiRootTimestampSyncNode(); + reinterpret_cast *>(cmdQHw)->timestampPacketContainer.reset(); + reinterpret_cast *>(cmdQHw)->template processDispatchForKernels(multiDispatchInfo, + printfHandler, + event.get(), + hwTimeStampsPtr, + blockQueue, + csrDeps, + blockedCommandsData, + timestampPacketDependencies); + + HardwareParse ccsHwParser; + ccsHwParser.parseCommands(cmdQHw->getCS(0), 0u); + + auto pipeControlItor = find(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end()); + bool expectedAddressInPipeControl = false; + while (pipeControlItor != ccsHwParser.cmdList.end()) { + auto pipeControlCmd = reinterpret_cast(*pipeControlItor); + uint64_t addressHigh = pipeControlCmd->getAddressHigh(); + uint64_t addressLow = pipeControlCmd->getAddress(); + addressHigh = addressHigh << 32; + uint64_t address = addressHigh | addressLow; + if (address == node->getGpuAddress() + node->getContextEndOffset()) { + expectedAddressInPipeControl = true; + break; + } + pipeControlItor = find(++pipeControlItor, ccsHwParser.cmdList.end()); + } + EXPECT_TRUE(expectedAddressInPipeControl); +} +HWTEST_F(CommandQueueHwTest, GivenMultiRootDeviceSyncEventWithEmptyDeviceSyncContainerWhenProcessDispatchForKernelsThenSyncNodeNotSignalled) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + MockDefaultContext context{true}; + std::unique_ptr pCmdQ1(createCommandQueue(context.getDevice(0), nullptr, &context)); + CommandQueueHw *cmdQHw = static_cast *>(pCmdQ1.get()); + MockKernelWithInternals mockKernelWithInternals(*context.getDevice(0), &context); + + MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector({mockKernelWithInternals.mockKernel})); + std::unique_ptr printfHandler; + std::unique_ptr hwTimeStamps = std::make_unique>>(); + TagNodeBase *hwTimeStampsPtr = hwTimeStamps.get(); + bool blockQueue = false; + 
CsrDependencies csrDeps = {}; + KernelOperation *blockedCommandsData = nullptr; + TimestampPacketDependencies timestampPacketDependencies = {}; + std::unique_ptr> event(new MockEvent(cmdQHw, CL_COMMAND_COPY_BUFFER, 0, 0)); + auto node = event->getMultiRootTimestampSyncNode(); + node->incRefCount(); + event->multiRootDeviceTimestampPacketContainer = std::make_unique(); + reinterpret_cast *>(cmdQHw)->timestampPacketContainer.reset(); + reinterpret_cast *>(cmdQHw)->template processDispatchForKernels(multiDispatchInfo, printfHandler, event.get(), hwTimeStampsPtr, blockQueue, csrDeps, blockedCommandsData, timestampPacketDependencies); + HardwareParse ccsHwParser; + ccsHwParser.parseCommands(cmdQHw->getCS(0), 0u); + + auto pipeControlItor = find(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end()); + bool expectedAddressInPipeControl = false; + while (pipeControlItor != ccsHwParser.cmdList.end()) { + auto pipeControlCmd = reinterpret_cast(*pipeControlItor); + uint64_t addressHigh = pipeControlCmd->getAddressHigh(); + uint64_t addressLow = pipeControlCmd->getAddress(); + addressHigh = addressHigh << 32; + uint64_t address = addressHigh | addressLow; + if (address == node->getGpuAddress() + node->getContextEndOffset()) { + expectedAddressInPipeControl = true; + break; + } + pipeControlItor = find(++pipeControlItor, ccsHwParser.cmdList.end()); + } + EXPECT_FALSE(expectedAddressInPipeControl); + node->returnTag(); +} \ No newline at end of file diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index 962e2a4a5a..1e0f618e58 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -29,6 +29,7 @@ #include "opencl/test/unit_test/fixtures/cl_device_fixture.h" #include "opencl/test/unit_test/mocks/mock_buffer.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include 
"opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_mdi.h" #include "opencl/test/unit_test/mocks/mock_program.h" diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp index 10d0e767c2..e080f6d16a 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp @@ -557,11 +557,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenTimestamp MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector({kernel1.mockKernel, kernel2.mockKernel})); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); size_t sizeWithDisabled = cmdQ.requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); size_t sizeWithEnabled = cmdQ.requestedCmdStreamSize; size_t additionalSize = 0u; @@ -669,7 +669,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenAutoLocal EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false, - false, *cmdQ.get(), multiDispatchInfo, false, false); + false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr); expectedSizeCS += sizeof(typename 
FamilyType::MI_BATCH_BUFFER_END); expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize); EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS); @@ -738,7 +738,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false, - false, *cmdQ.get(), multiDispatchInfo, false, false); + false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr); expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END); expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize); EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS); diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp index 9bbee21f1b..ab44c67049 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -233,7 +233,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_MARKER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest, - eventBuilder, std::unique_ptr(nullptr), nullptr); + eventBuilder, std::unique_ptr(nullptr), nullptr, nullptr); EXPECT_FALSE(blockedCommandsDataForDependencyFlush->blitEnqueue); } @@ -266,7 +266,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_READ_BUFFER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest, - eventBuilder, 
std::unique_ptr(nullptr), mockCmdQ->getBcsForAuxTranslation()); + eventBuilder, std::unique_ptr(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr); EXPECT_TRUE(blockedCommandsDataForBlitEnqueue->blitEnqueue); EXPECT_EQ(blitProperties.srcAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->srcAllocation); EXPECT_EQ(blitProperties.dstAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->dstAllocation); @@ -350,7 +350,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitEnqueueWhenDispatchingCommandsWithoutK timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false); + eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); @@ -394,7 +394,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenN1EnabledWhenDispatchingWithoutKernelThenA mockCmdQ->obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, true, bcsCsr); timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, - eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false); + eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); @@ -440,7 +440,7 @@ HWTEST_F(DispatchFlagsTests, givenMockKernelWhenSettingAdditionalKernelExecInfoT std::vector v; pKernel->setAdditionalKernelExecInfo(123u); - std::unique_ptr cmd(new 
CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1)); + std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr)); cmd->submit(1u, false); EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, 123u); diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp index 07d5dc7321..d794aa5338 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp @@ -53,7 +53,7 @@ HWTEST2_F(DispatchFlagsTests, whenSubmittingKernelWithAdditionalKernelExecInfoTh std::vector v; pKernel->setAdditionalKernelExecInfo(AdditionalKernelExecInfo::DisableOverdispatch); - std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1)); + std::unique_ptr cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr)); cmd->submit(1u, false); EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, AdditionalKernelExecInfo::DisableOverdispatch); diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp index e4e53386b4..aed5c4b56e 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp @@ -2002,10 +2002,10 @@ HWTEST_F(PauseOnGpuTests, givenGpuScratchWriteEnabledWhenEstimatingCommandStream 
dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); DebugManager.flags.GpuScratchRegWriteAfterWalker.set(1); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); EXPECT_EQ(baseCommandStreamSize + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM), extendedCommandStreamSize); } diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp index b3182664df..57ff16afe5 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp @@ -1010,8 +1010,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithoutW dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = 
EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr); EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size + MemorySynchronizationCommands::getSizeForSingleBarrier(false), extendedCommandStreamSize); } @@ -1029,8 +1029,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueKernelTest, givenTimestampWriteEnableOnMulti dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr); EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size + ImplicitScalingDispatch::getBarrierSize(csr.peekHwInfo(), false, false), extendedCommandStreamSize); } @@ -1043,8 +1043,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithWait dispatchInfo.setKernel(mockKernel.mockKernel); multiDispatchInfo.push(dispatchInfo); - auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); - auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto 
extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true, nullptr); EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size, extendedCommandStreamSize); } diff --git a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp index 68e1038d82..a7c9169e62 100644 --- a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp @@ -23,6 +23,7 @@ #include "opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h" #include "opencl/test/unit_test/fixtures/image_fixture.h" #include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h" +#include "opencl/test/unit_test/mocks/mock_event.h" using namespace NEO; @@ -96,7 +97,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenFillingBufferThenHeapsAndCommandBufferCo auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_FILL_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -149,7 +150,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenCopyingBufferThenHeapsAndCommandBufferCo auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto 
expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -203,7 +204,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferNonBlockingThenHeapsAndComm auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -258,7 +259,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferBlockingThenThenHeapsAndCom auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -313,7 +314,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferNonBlockingThenHeapsAndComm auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = 
HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -365,7 +366,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand auto usedAfterSSH = ssh.getUsed(); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -380,6 +381,68 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand EXPECT_GE(expectedSizeSSH, usedAfterSSH - usedBeforeSSH); } +HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForSingleDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) { + UltClDeviceFactory deviceFactory{1, 0}; + DebugManager.flags.EnableMultiRootDeviceContexts.set(true); + + cl_device_id devices[] = {deviceFactory.rootDevices[0]}; + + MockContext pContext(ClDeviceVector(devices, 1)); + MockKernelWithInternals mockKernel(*pContext.getDevices()[0]); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + auto event = std::make_unique>(&pContext, nullptr, 0, 0, 0); + cl_event clEvent = event.get(); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent); + + 
EXPECT_EQ(baseCommandStreamSize, extendedCommandStreamSize); +} + +HWTEST_F(GetSizeRequiredBufferTest, GivenUserEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) { + UltClDeviceFactory deviceFactory{2, 0}; + DebugManager.flags.EnableMultiRootDeviceContexts.set(true); + + cl_device_id devices[] = {deviceFactory.rootDevices[0], + deviceFactory.rootDevices[1]}; + + MockContext pContext(ClDeviceVector(devices, 2)); + MockKernelWithInternals mockKernel(*pContext.getDevices()[0]); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + auto userEvent1 = std::make_unique(&pContext); + cl_event clEvent = userEvent1.get(); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent); + + EXPECT_EQ(baseCommandStreamSize, extendedCommandStreamSize); +} + +HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsAdded) { + UltClDeviceFactory deviceFactory{2, 0}; + DebugManager.flags.EnableMultiRootDeviceContexts.set(true); + + cl_device_id devices[] = {deviceFactory.rootDevices[0], + deviceFactory.rootDevices[1]}; + + MockContext pContext(ClDeviceVector(devices, 2)); + MockKernelWithInternals mockKernel(*pContext.getDevices()[0]); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + auto event = std::make_unique>(&pContext, nullptr, 0, 0, 0); + cl_event clEvent = event.get(); + auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, 
false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr); + auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent); + + EXPECT_EQ(baseCommandStreamSize + MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(pContext.getDevices()[0]->getHardwareInfo(), false), extendedCommandStreamSize); +} + HWTEST_F(GetSizeRequiredBufferTest, givenMultipleKernelRequiringSshWhenTotalSizeIsComputedThenItIsProperlyAligned) { auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer, pCmdQ->getClDevice()); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp index fb00196b5a..5602b17916 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp @@ -1902,7 +1902,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenItIsUnblocke blockedCommandsData->setHeaps(dsh, ioh, ssh); std::vector surfaces; - event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1)); + event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr)); event->submitCommand(false); EXPECT_EQ(numGrfRequired, csr->savedDispatchFlags.numGrfRequired); @@ -1947,7 +1947,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenInitializeBc auto blockedCommandsData = std::make_unique(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage()); std::vector surfaces; - event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, 
false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1)); + event->setCommand(std::make_unique(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr)); event->submitCommand(false); EXPECT_FALSE(pCmdQ->isCsrLocked); } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp index 87fb538093..c7bac440f4 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp @@ -7,12 +7,14 @@ #include "shared/source/command_stream/wait_status.h" #include "shared/test/common/mocks/mock_command_stream_receiver.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/hw_test.h" #include "opencl/source/event/user_event.h" #include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h" #include "opencl/test/unit_test/fixtures/ult_command_stream_receiver_fixture.h" +#include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_program.h" #include "opencl/test/unit_test/test_macros/test_checks_ocl.h" @@ -44,12 +46,18 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu MockGraphicsAllocation svmAlloc(svmPtr, svmSize); Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + auto node1 = event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + auto node3 = event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + auto node4 = event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ2, 
CL_COMMAND_NDRANGE_KERNEL, 2, 7); + auto node5 = event5.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); + userEvent1.getMultiRootTimestampSyncNode(); UserEvent userEvent2(&pCmdQ2->getContext()); + userEvent2.getMultiRootTimestampSyncNode(); userEvent1.setStatus(CL_COMPLETE); userEvent2.setStatus(CL_COMPLETE); @@ -86,12 +94,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu EXPECT_EQ(2u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } { @@ -114,12 +122,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu EXPECT_EQ(2u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); - 
EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } alignedFree(svmPtr); } @@ -146,17 +154,24 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo cl_device_id devices[] = {device1, device2, device3}; auto context = std::make_unique(ClDeviceVector(devices, 3), false); - + auto mockTagAllocator = std::make_unique>(context->getRootDeviceIndices(), device1->getExecutionEnvironment()->memoryManager.get(), 10u); + std::unique_ptr uniquePtr(mockTagAllocator.release()); + context->setMultiRootDeviceTimestampPacketAllocator(uniquePtr); auto pCmdQ1 = context->getSpecialQueue(1u); auto pCmdQ2 = context->getSpecialQueue(2u); auto pCmdQ3 = context->getSpecialQueue(3u); Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + auto node1 = event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + auto node3 = event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + auto node4 = event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ3, CL_COMMAND_NDRANGE_KERNEL, 7, 21); + auto node5 = event5.getMultiRootTimestampSyncNode(); Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + auto node6 = event6.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -189,16 +204,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), 
semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(21u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); auto semaphoreCmd2 = genCmdCast(*(semaphores[2])); - EXPECT_EQ(7u, semaphoreCmd2->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node6->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress()); } { @@ -214,16 +229,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); auto 
semaphoreCmd2 = genCmdCast(*(semaphores[2])); - EXPECT_EQ(21u, semaphoreCmd2->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress()); } { @@ -248,8 +263,8 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); } } @@ -285,11 +300,16 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6); + event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + event5.getMultiRootTimestampSyncNode(); Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + event6.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -315,10 +335,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); CsrDependencies csrDeps; - 
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver()); + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver()); - EXPECT_EQ(0u, csrDeps.taskCountContainer.size()); - EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps)); + EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size()); + EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps)); } { @@ -341,10 +361,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); CsrDependencies csrDeps; - eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); - EXPECT_EQ(3u, csrDeps.taskCountContainer.size()); - EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps)); + EXPECT_EQ(3u, csrDeps.multiRootTimeStampSyncContainer.size()); + EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDeps)); } } @@ -404,8 +424,6 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(0u, semaphores.size()); } userEvent1.setStatus(CL_COMPLETE); - event1->release(); - event2->release(); pCmdQ1->finish(); pCmdQ2->finish(); { @@ -416,7 +434,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd = genCmdCast(*(semaphores[0])); EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress()); + 
EXPECT_EQ(reinterpret_cast(event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress()); } { HardwareParse csHwParser; @@ -425,9 +443,11 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd = genCmdCast(*(semaphores[0])); - EXPECT_EQ(0u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress()); } + event1->release(); + event2->release(); buffer->release(); } @@ -589,9 +609,6 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(0u, semaphores.size()); } userEvent1.setStatus(CL_COMPLETE); - event1->release(); - event2->release(); - event3->release(); pCmdQ1->finish(); pCmdQ2->finish(); @@ -603,7 +620,8 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(1u, semaphores.size()); auto semaphoreCmd = genCmdCast(*(semaphores[0])); EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress()); + auto node = event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0); + EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd->getSemaphoreGraphicsAddress()); } { HardwareParse csHwParser; @@ -619,8 +637,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_EQ(2u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(0u, semaphoreCmd0->getSemaphoreDataDword()); - 
EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + auto node = event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0); + EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd0->getSemaphoreGraphicsAddress()); } { HardwareParse csHwParser; @@ -629,6 +648,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW EXPECT_LE(1u, semaphores.size()); } + event1->release(); + event2->release(); + event3->release(); buffer->release(); pCmdQ1->release(); pCmdQ2->release(); @@ -878,3 +900,72 @@ HWTEST_F(UltCommandStreamReceiverTest, givenDebugDisablingCacheFlushWhenAddingPi EXPECT_FALSE(pipeControl->getConstantCacheInvalidationEnable()); EXPECT_FALSE(pipeControl->getStateCacheInvalidationEnable()); } +HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) { + Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); + Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6); + Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + UserEvent userEvent1(&pCmdQ1->getContext()); + UserEvent userEvent2(&pCmdQ2->getContext()); + + userEvent1.setStatus(CL_COMPLETE); + userEvent2.setStatus(CL_COMPLETE); + { + cl_event eventWaitList[] = + { + &event1, + &event2, + &event3, + &event4, + &event5, + &event6, + &userEvent1, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + + EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); + CsrDependencies csrDeps; + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); 
+ + EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size()); + } +} +HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeContainersThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) { + + MockEvent event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + event1.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); + MockEvent event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + event3.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + event4.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + event5.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + MockEvent event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + event6.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer()); + UserEvent userEvent1(&pCmdQ1->getContext()); + + userEvent1.setStatus(CL_COMPLETE); + + { + cl_event eventWaitList[] = + { + &event1, + &event2, + &event3, + &event4, + &event5, + &event6, + &userEvent1, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + + EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); + CsrDependencies csrDeps; + eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver()); + + EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size()); + } +} \ No newline at end of file diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp index f8945bb9c2..add72377fc 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp +++ 
b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp @@ -11,6 +11,7 @@ #include "shared/source/command_stream/wait_status.h" #include "shared/source/helpers/constants.h" #include "shared/source/helpers/logical_state_helper.h" +#include "shared/source/os_interface/device_factory.h" #include "shared/source/os_interface/hw_info_config.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" @@ -1216,19 +1217,19 @@ HWTEST_P(BcsDetaliedTestsWithParams, givenBltSizeWithLeftoverWhenDispatchedThenP auto allocation = buffer->getGraphicsAllocation(pDevice->getRootDeviceIndex()); auto memoryManager = static_cast(pDevice->getMemoryManager()); memoryManager->returnFakeAllocation = true; - auto blitProperties = BlitProperties::constructPropertiesForReadWrite(std::get<1>(GetParam()), //blitDirection - csr, allocation, //commandStreamReceiver - nullptr, //memObjAllocation - hostPtr, //preallocatedHostAllocation - allocation->getGpuAddress(), //memObjGpuVa - 0, //hostAllocGpuVa - hostPtrOffset, //hostPtrOffset - copyOffset, //copyOffset - bltSize, //copySize - dstRowPitch, //hostRowPitch - dstSlicePitch, //hostSlicePitch - srcRowPitch, //gpuRowPitch - srcSlicePitch //gpuSlicePitch + auto blitProperties = BlitProperties::constructPropertiesForReadWrite(std::get<1>(GetParam()), // blitDirection + csr, allocation, // commandStreamReceiver + nullptr, // memObjAllocation + hostPtr, // preallocatedHostAllocation + allocation->getGpuAddress(), // memObjGpuVa + 0, // hostAllocGpuVa + hostPtrOffset, // hostPtrOffset + copyOffset, // copyOffset + bltSize, // copySize + dstRowPitch, // hostRowPitch + dstSlicePitch, // hostSlicePitch + srcRowPitch, // gpuRowPitch + srcSlicePitch // gpuSlicePitch ); memoryManager->returnFakeAllocation = false; @@ -1321,19 +1322,19 @@ HWTEST_P(BcsDetaliedTestsWithParams, givenBltSizeWithLeftoverWhenDispatchedThenP auto memoryManager = 
static_cast(pDevice->getMemoryManager()); memoryManager->returnFakeAllocation = true; - auto blitProperties = BlitProperties::constructPropertiesForReadWrite(std::get<1>(GetParam()), //blitDirection - csr, allocation, //commandStreamReceiver - nullptr, //memObjAllocation - hostPtr, //preallocatedHostAllocation - allocation->getGpuAddress(), //memObjGpuVa - 0, //hostAllocGpuVa - hostPtrOffset, //hostPtrOffset - copyOffset, //copyOffset - bltSize, //copySize - dstRowPitch, //hostRowPitch - dstSlicePitch, //hostSlicePitch - srcRowPitch, //gpuRowPitch - srcSlicePitch //gpuSlicePitch + auto blitProperties = BlitProperties::constructPropertiesForReadWrite(std::get<1>(GetParam()), // blitDirection + csr, allocation, // commandStreamReceiver + nullptr, // memObjAllocation + hostPtr, // preallocatedHostAllocation + allocation->getGpuAddress(), // memObjGpuVa + 0, // hostAllocGpuVa + hostPtrOffset, // hostPtrOffset + copyOffset, // copyOffset + bltSize, // copySize + dstRowPitch, // hostRowPitch + dstSlicePitch, // hostSlicePitch + srcRowPitch, // gpuRowPitch + srcSlicePitch // gpuSlicePitch ); memoryManager->returnFakeAllocation = false; @@ -1417,16 +1418,16 @@ HWTEST_P(BcsDetaliedTestsWithParams, givenBltSizeWithLeftoverWhenDispatchedThenP size_t buffer2SlicePitch = std::get<0>(GetParam()).srcSlicePitch; auto allocation = buffer1->getGraphicsAllocation(pDevice->getRootDeviceIndex()); - auto blitProperties = BlitProperties::constructPropertiesForCopy(allocation, //dstAllocation - allocation, //srcAllocation - buffer1Offset, //dstOffset - buffer2Offset, //srcOffset - bltSize, //copySize - buffer1RowPitch, //srcRowPitch - buffer1SlicePitch, //srcSlicePitch - buffer2RowPitch, //dstRowPitch - buffer2SlicePitch, //dstSlicePitch - csr.getClearColorAllocation() //clearColorAllocation + auto blitProperties = BlitProperties::constructPropertiesForCopy(allocation, // dstAllocation + allocation, // srcAllocation + buffer1Offset, // dstOffset + buffer2Offset, // srcOffset + bltSize, // 
copySize + buffer1RowPitch, // srcRowPitch + buffer1SlicePitch, // srcSlicePitch + buffer2RowPitch, // dstRowPitch + buffer2SlicePitch, // dstSlicePitch + csr.getClearColorAllocation() // clearColorAllocation ); flushBcsTask(&csr, blitProperties, true, *pDevice); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp index 78b1a13ac9..2d4c5193ac 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp @@ -1789,4 +1789,4 @@ HWTEST_F(BcsTests, givenHostPtrToImageWhenBlitBufferIsCalledThenBlitCmdIsFound) hwParser.parseCommands(csr.commandStream, 0); auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); EXPECT_NE(hwParser.cmdList.end(), cmdIterator); -} +} \ No newline at end of file diff --git a/opencl/test/unit_test/event/event_builder_tests.cpp b/opencl/test/unit_test/event/event_builder_tests.cpp index 28d1bcb19b..f5f3e0f065 100644 --- a/opencl/test/unit_test/event/event_builder_tests.cpp +++ b/opencl/test/unit_test/event/event_builder_tests.cpp @@ -79,7 +79,7 @@ TEST(EventBuilder, givenVirtualEventWithCommandThenFinalizeAddChild) { public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); @@ -129,7 +129,7 @@ TEST(EventBuilder, givenVirtualEventWithSubmittedCommandAsParentThenFinalizeNotA public: using 
CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp index 022ad0a1c1..05e28a134f 100644 --- a/opencl/test/unit_test/event/event_tests.cpp +++ b/opencl/test/unit_test/event/event_tests.cpp @@ -483,7 +483,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandFalseWhenSubmittingCommandsThenRefA PreemptionMode preemptionMode = pDevice->getPreemptionMode(); v.push_back(bufferSurf); - auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); auto taskLevelBefore = csr.peekTaskLevel(); @@ -526,7 +526,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandTrueWhenSubmittingCommandsThenRefAp NullSurface *surface = new NullSurface; v.push_back(surface); PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); auto taskLevelBefore = csr.peekTaskLevel(); @@ -577,7 +577,7 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut 
std::vector v; PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); event.submitCommand(false); @@ -629,7 +629,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnCmdQueueWaitFunctionAndBlockedKernelWit std::vector v; PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); event.submitCommand(false); @@ -678,7 +678,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnPrintingEnqueueOutputAndBlockedKernelWi std::vector v; PreemptionMode preemptionMode = pDevice->getPreemptionMode(); - auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr); event.setCommand(std::unique_ptr(cmd)); event.submitCommand(false); @@ -1167,7 +1167,7 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCsrOccurs) { public: using CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : 
CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; class MockEvent : public Event { public: @@ -1748,7 +1748,7 @@ HWTEST_F(InternalsEventTest, givenAbortedCommandWhenSubmitCalledThenDontUpdateFl blockedCommandsData->setHeaps(dsh, ioh, ssh); PreemptionMode preemptionMode = pDevice->getPreemptionMode(); std::vector v; - auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1); + auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr); event->setCommand(std::unique_ptr(cmd)); FlushStamp expectedFlushStamp = 0; diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp index 1cdcdba559..77e18d5cd8 100644 --- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp +++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp @@ -2434,7 +2434,7 @@ HWTEST_F(GTPinTests, givenGtPinInitializedWhenSubmittingKernelCommandThenFlushed gtpinNotifyKernelSubmit(kernel.mockMultiDeviceKernel, mockCmdQ.get()); - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr)); CompletionStamp stamp = command->submit(20, false); ASSERT_EQ(1u, kernelExecQueue.size()); diff --git a/opencl/test/unit_test/helpers/task_information_tests.cpp b/opencl/test/unit_test/helpers/task_information_tests.cpp index ead89cf244..404a69b640 100644 --- a/opencl/test/unit_test/helpers/task_information_tests.cpp +++ b/opencl/test/unit_test/helpers/task_information_tests.cpp @@ -154,7 +154,7 @@ TEST(CommandTest, givenWaitlistRequestWhenCommandComputeKernelIsCreatedThenMakeL public: using 
CommandComputeKernel::eventsWaitlist; MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector &surfaces, Kernel *kernel) - : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {} + : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {} }; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); @@ -291,7 +291,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD for (auto &surface : surfaces) { requiresCoherency |= surface->IsCoherent; } - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr)); command->submit(20, false); EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode); @@ -339,7 +339,7 @@ HWTEST_F(DispatchFlagsTests, givenClCommandCopyImageWhenSubmitThenFlushTextureCa for (auto &surface : surfaces) { requiresCoherency |= surface->IsCoherent; } - std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1, nullptr)); command->submit(20, false); EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode); @@ -425,7 +425,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD bool flushDC = false; bool slmUsed = false; bool ndRangeKernel = false; - std::unique_ptr command(new 
CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1)); + std::unique_ptr command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr)); command->submit(20, false); EXPECT_TRUE(mockCsr->passedDispatchFlags.epilogueRequired); diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index 529be63491..d4e224a033 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -34,11 +34,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl auto mockCmdQHw = std::make_unique>(context, device.get(), nullptr); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; auto extendedSize = sizeWithDisabled + sizeof(typename FamilyType::PIPE_CONTROL); @@ -52,7 +52,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, - false, multiDispatchInfo, nullptr, 0, false, false); + false, 
multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; @@ -82,7 +82,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat eventsRequest.fillCsrDependenciesForTimestampPacketContainer( csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; @@ -143,7 +143,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr auto mockCmdQHw = std::make_unique>(context, device.get(), nullptr); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; @@ -172,7 +172,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr CsrDependencies csrDeps; eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false); + getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; diff --git 
a/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp b/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp index 7f5bffca9e..a996fba7bc 100644 --- a/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp @@ -212,7 +212,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd { EXPECT_FALSE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ)); - initialSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false); + initialSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr); } { @@ -226,7 +226,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd ultCsr.multiOsContextCapable = false; EXPECT_TRUE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ)); - sizeWithCacheFlush = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false); + sizeWithCacheFlush = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr); } EXPECT_EQ(initialSize + expectedDiff, sizeWithCacheFlush); diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index c3c1317286..a6c4ea6b03 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -503,6 +503,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver { using BaseClass::CommandStreamReceiver; TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; } + std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer 
rootDeviceIndices) override { return std::unique_ptr(nullptr); } SubmissionStatus flushTagUpdate() override { return SubmissionStatus::SUCCESS; }; void updateTagFromWait() override{}; diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index de4f8e5f14..6d6bb63fe2 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -765,11 +765,11 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferOperationWithoutKernelWhenEstimati auto &hwInfo = cmdQ->getDevice().getHardwareInfo(); auto readBufferCmdsSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, csrDependencies, false, false, - true, *cmdQ, multiDispatchInfo, false, false); + true, *cmdQ, multiDispatchInfo, false, false, nullptr); auto writeBufferCmdsSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, csrDependencies, false, false, - true, *cmdQ, multiDispatchInfo, false, false); + true, *cmdQ, multiDispatchInfo, false, false, nullptr); auto copyBufferCmdsSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, csrDependencies, false, false, - true, *cmdQ, multiDispatchInfo, false, false); + true, *cmdQ, multiDispatchInfo, false, false, nullptr); auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); if (cmdQ->isCacheFlushForBcsRequired()) { diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 1fe5fc2f00..a0f9db020e 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -248,6 +248,7 @@ class MockCommandQueueHw : public CommandQueueHw { using BaseClass::latestSentEnqueueType; using BaseClass::obtainCommandStream; using BaseClass::obtainNewTimestampPacketNodes; + using BaseClass::processDispatchForKernels; using BaseClass::requiresCacheFlushAfterWalker; using 
BaseClass::throttle; using BaseClass::timestampPacketContainer; diff --git a/opencl/test/unit_test/mocks/mock_event.h b/opencl/test/unit_test/mocks/mock_event.h index 376e638f39..c579b42596 100644 --- a/opencl/test/unit_test/mocks/mock_event.h +++ b/opencl/test/unit_test/mocks/mock_event.h @@ -39,6 +39,7 @@ struct MockEvent : public BaseEventType { using Event::calculateSubmitTimestampData; using Event::isWaitForTimestampsEnabled; using Event::magic; + using Event::multiRootDeviceTimestampPacketContainer; using Event::queueTimeStamp; using Event::submitTimeStamp; using Event::timestampPacketContainer; diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp index d456c5792d..49467a6ce7 100644 --- a/opencl/test/unit_test/profiling/profiling_tests.cpp +++ b/opencl/test/unit_test/profiling/profiling_tests.cpp @@ -71,13 +71,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor MultiDispatchInfo multiDispatchInfo(&kernel); auto &commandStreamNDRangeKernel = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, &kernel, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamNDRangeKernel.getAvailableSpace(), requiredSize); auto &commandStreamTask = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, &kernel, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize); @@ -93,13 +93,13 @@ HWTEST_F(ProfilingTests, GivenCommandQueueWithProfilingAndForWorkloadWithNoKerne MultiDispatchInfo 
multiDispatchInfo(nullptr); auto &commandStreamMigrateMemObjects = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, false, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize); auto &commandStreamMarker = getCommandStream(*pCmdQ, CsrDependencies(), true, - false, false, multiDispatchInfo, nullptr, 0, false, false); + false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MARKER, true, false, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize); @@ -121,9 +121,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor multiDispatchInfo.push(dispatchInfo); multiDispatchInfo.push(dispatchInfo); auto &commandStreamTask = getCommandStream(*pCmdQ, CsrDependencies(), true, false, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_TASK, CsrDependencies(), true, false, - false, *pCmdQ, multiDispatchInfo, false, false); + false, *pCmdQ, multiDispatchInfo, false, false, nullptr); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize); } @@ -741,13 +741,13 @@ HWTEST_F(ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCount MultiDispatchInfo multiDispatchInfo(nullptr); auto &commandStreamMigrateMemObjects = getCommandStream(*pCmdQ, CsrDependencies(), true, true, false, multiDispatchInfo, - nullptr, 0, false, false); + nullptr, 0, false, false, nullptr); auto expectedSizeCS = 
EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, true, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize); auto &commandStreamMarker = getCommandStream(*pCmdQ, CsrDependencies(), true, true, false, - multiDispatchInfo, nullptr, 0, false, false); + multiDispatchInfo, nullptr, 0, false, false, nullptr); expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_MARKER, true, true, *pCmdQ, nullptr, {}); EXPECT_GE(expectedSizeCS, requiredSize); EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize); diff --git a/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp b/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp index e75268af99..9f929df14f 100644 --- a/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp +++ b/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp @@ -243,10 +243,14 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent MockGraphicsAllocation svmAlloc(svmPtr, svmSize); Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + auto node1 = event1.getMultiRootTimestampSyncNode(); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + auto node3 = event3.getMultiRootTimestampSyncNode(); Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + auto node4 = event4.getMultiRootTimestampSyncNode(); Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + auto node5 = event5.getMultiRootTimestampSyncNode(); UserEvent userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -285,12 +289,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); - 
EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } { @@ -313,12 +317,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent EXPECT_EQ(3u, semaphores.size()); auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); - EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress()); auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); - EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); - EXPECT_EQ(reinterpret_cast(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress()); } alignedFree(svmPtr); } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index b91ca820ac..bb184ace2f 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -221,6 +221,7 @@ 
class CommandStreamReceiver { TagAllocatorBase *getEventTsAllocator(); TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize); virtual TagAllocatorBase *getTimestampPacketAllocator() = 0; + virtual std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) = 0; virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation); diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index 730a909b7c..38cec769c6 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -134,6 +134,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { GraphicsAllocation *getClearColorAllocation() override; TagAllocatorBase *getTimestampPacketAllocator() override; + std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override; void postInitFlagsSetup() override; void programActivePartitionConfig(LinearStream &csr); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index c48bb794a2..7c355910a3 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -403,7 +403,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto commandStreamStartCSR = commandStreamCSR.getUsed(); TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStreamCSR, dispatchFlags.csrDependencies); - TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStreamCSR, dispatchFlags.csrDependencies); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStreamCSR, dispatchFlags.csrDependencies); 
programActivePartitionConfigFlushTask(commandStreamCSR); programEngineModeCommands(commandStreamCSR, dispatchFlags); @@ -977,7 +977,7 @@ size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const Dispat } size += TimestampPacketHelper::getRequiredCmdStreamSize(dispatchFlags.csrDependencies); - size += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(dispatchFlags.csrDependencies); + size += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(dispatchFlags.csrDependencies); size += EncodeKernelArgsBuffer::getKernelArgsBufferCmdsSize(kernelArgsBufferAllocation, logicalStateHelper.get()); @@ -1193,7 +1193,7 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert for (auto &blitProperties : blitPropertiesContainer) { TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, blitProperties.csrDependencies); - TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStream, blitProperties.csrDependencies); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStream, blitProperties.csrDependencies); BlitCommandsHelper::encodeWa(commandStream, blitProperties, latestSentBcsWaValue); @@ -1226,6 +1226,12 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert if (blitProperties.clearColorAllocation) { makeResident(*blitProperties.clearColorAllocation); } + if (blitProperties.multiRootDeviceEventSync != nullptr) { + MiFlushArgs args; + args.commandWithPostSync = true; + args.notifyEnable = isUsedNotifyEnableForPostSync(); + EncodeMiFlushDW::programMiFlushDw(commandStream, blitProperties.multiRootDeviceEventSync->getGpuAddress() + blitProperties.multiRootDeviceEventSync->getContextEndOffset(), std::numeric_limits::max(), args, hwInfo); + } } BlitCommandsHelper::programGlobalSequencerFlush(commandStream); @@ -1242,7 +1248,6 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert 
MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), false, peekHwInfo()); } - if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnBlitCopy.get(), taskCount, PauseOnGpuProperties::PauseMode::AfterWorkload)) { BlitCommandsHelper::dispatchDebugPauseCommands(commandStream, getDebugPauseStateGPUAddress(), DebugPauseState::waitingForUserEndConfirmation, @@ -1519,6 +1524,11 @@ TagAllocatorBase *CommandStreamReceiverHw::getTimestampPacketAllocato return timestampPacketAllocator.get(); } +template +std::unique_ptr CommandStreamReceiverHw::createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) { + auto &gfxCoreHelper = getGfxCoreHelper(); + return gfxCoreHelper.createTimestampPacketAllocator(rootDeviceIndices, getMemoryManager(), getPreferredTagPoolSize(), getType(), osContext->getDeviceBitfield()); +} template void CommandStreamReceiverHw::postInitFlagsSetup() { useNewResourceImplicitFlush = checkPlatformSupportsNewResourceImplicitFlush(); diff --git a/shared/source/command_stream/csr_deps.h b/shared/source/command_stream/csr_deps.h index a601179e88..7ba5c46615 100644 --- a/shared/source/command_stream/csr_deps.h +++ b/shared/source/command_stream/csr_deps.h @@ -22,7 +22,7 @@ class CsrDependencies { All }; - StackVec, 32> taskCountContainer; + StackVec multiRootTimeStampSyncContainer; StackVec timestampPacketContainer; void makeResident(CommandStreamReceiver &commandStreamReceiver) const; diff --git a/shared/source/helpers/blit_commands_helper.cpp b/shared/source/helpers/blit_commands_helper.cpp index 57cb67c5de..b3c198eacc 100644 --- a/shared/source/helpers/blit_commands_helper.cpp +++ b/shared/source/helpers/blit_commands_helper.cpp @@ -50,6 +50,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants: BlitterConstants::BlitDirection::HostPtrToImage == blitDirection) { return { nullptr, // outputTimestampPacket + nullptr, // 
multiRootDeviceEventSync blitDirection, // blitDirection {}, // csrDependencies AuxTranslationDirection::None, // auxTranslationDirection @@ -73,6 +74,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants: } else { return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync blitDirection, // blitDirection {}, // csrDependencies AuxTranslationDirection::None, // auxTranslationDirection @@ -104,6 +106,7 @@ BlitProperties BlitProperties::constructPropertiesForCopy(GraphicsAllocation *ds return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection {}, // csrDependencies AuxTranslationDirection::None, // auxTranslationDirection @@ -128,6 +131,7 @@ BlitProperties BlitProperties::constructPropertiesForAuxTranslation(AuxTranslati auto allocationSize = allocation->getUnderlyingBufferSize(); return { nullptr, // outputTimestampPacket + nullptr, // multiRootDeviceEventSync BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection {}, // csrDependencies auxTranslationDirection, // auxTranslationDirection diff --git a/shared/source/helpers/blit_commands_helper.h b/shared/source/helpers/blit_commands_helper.h index f5f4d345bd..2c7ef5f0fd 100644 --- a/shared/source/helpers/blit_commands_helper.h +++ b/shared/source/helpers/blit_commands_helper.h @@ -61,6 +61,7 @@ struct BlitProperties { CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr); TagNodeBase *outputTimestampPacket = nullptr; + TagNodeBase *multiRootDeviceEventSync = nullptr; BlitterConstants::BlitDirection blitDirection = BlitterConstants::BlitDirection::BufferToHostPtr; CsrDependencies csrDependencies; AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None; diff --git a/shared/source/helpers/blit_commands_helper_base.inl b/shared/source/helpers/blit_commands_helper_base.inl index 0507ecce64..9c5aae061d 100644 --- 
a/shared/source/helpers/blit_commands_helper_base.inl +++ b/shared/source/helpers/blit_commands_helper_base.inl @@ -125,7 +125,7 @@ size_t BlitCommandsHelper::estimateBlitCommandSize(const Vec3 sizePerBlit += estimatePostBlitCommandSize(); return TimestampPacketHelper::getRequiredCmdStreamSize(csrDependencies) + - TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDependencies) + + TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(csrDependencies) + (sizePerBlit * nBlits) + timestampCmdSize + estimatePreBlitCommandSize(); @@ -141,6 +141,9 @@ size_t BlitCommandsHelper::estimateBlitCommandsSize(const BlitPropert auto isImage = blitProperties.isImageOperation(); size += BlitCommandsHelper::estimateBlitCommandSize(blitProperties.copySize, blitProperties.csrDependencies, updateTimestampPacket, profilingEnabled, isImage, rootDeviceEnvironment, blitProperties.isSystemMemoryPoolUsed); + if (blitProperties.multiRootDeviceEventSync != nullptr) { + size += EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite(); + } } size += BlitCommandsHelper::getWaCmdsSize(blitPropertiesContainer); size += 2 * MemorySynchronizationCommands::getSizeForAdditonalSynchronization(*rootDeviceEnvironment.getHardwareInfo()); diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index c42aa3d5c3..dae11adb4e 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -145,17 +145,11 @@ struct TimestampPacketHelper { } template - static void programCsrDependenciesForForTaskCountContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) { - auto &taskCountContainer = csrDependencies.taskCountContainer; - - for (auto &[taskCountPreviousRootDevice, tagAddressPreviousRootDevice] : taskCountContainer) { - using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; - using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; - 
- EncodeSempahore::addMiSemaphoreWaitCommand(cmdStream, - static_cast(tagAddressPreviousRootDevice), - static_cast(taskCountPreviousRootDevice), - COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + static void programCsrDependenciesForForMultiRootDeviceSyncContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) { + for (auto timestampPacketContainer : csrDependencies.multiRootTimeStampSyncContainer) { + for (auto &node : timestampPacketContainer->peekNodes()) { + TimestampPacketHelper::programSemaphore(cmdStream, *node); + } } } @@ -218,8 +212,8 @@ struct TimestampPacketHelper { } template - static size_t getRequiredCmdStreamSizeForTaskCountContainer(const CsrDependencies &csrDependencies) { - return csrDependencies.taskCountContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); + static size_t getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(const CsrDependencies &csrDependencies) { + return csrDependencies.multiRootTimeStampSyncContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); } }; diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index 00bd156ba3..4b3e21b209 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -17,6 +17,7 @@ #include "shared/source/memory_manager/graphics_allocation.h" #include "shared/source/memory_manager/surface.h" #include "shared/source/os_interface/os_context.h" +#include "shared/source/utilities/tag_allocator.h" #include "shared/test/common/helpers/dispatch_flags_helper.h" #include @@ -93,6 +94,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { }; TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; } + std::unique_ptr createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override { return std::unique_ptr(nullptr); } 
CompletionStamp flushTask( LinearStream &commandStream, diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index bba0fd020b..9a8cd4323a 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -15,10 +15,13 @@ #include "shared/source/helpers/api_specific_config.h" #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/source/memory_manager/surface.h" +#include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/os_interface/device_factory.h" #include "shared/source/os_interface/hw_info_config.h" #include "shared/source/os_interface/os_interface.h" #include "shared/source/utilities/tag_allocator.h" +#include "shared/test/common/cmd_parse/gen_cmd_parse.h" +#include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/fixtures/command_stream_receiver_fixture.inl" #include "shared/test/common/fixtures/device_fixture.h" #include "shared/test/common/helpers/batch_buffer_helper.h" @@ -32,6 +35,7 @@ #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_internal_allocation_storage.h" #include "shared/test/common/mocks/mock_memory_manager.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/hw_test.h" #include "shared/test/common/test_macros/test_checks_shared.h" @@ -2461,3 +2465,89 @@ HWTEST_F(CommandStreamReceiverHwTest, givenVariousCsrModeWhenGettingTbxModeThenE ultCsr.commandStreamReceiverType = CommandStreamReceiverType::CSR_TBX_WITH_AUB; EXPECT_TRUE(ultCsr.isTbxMode()); } + +HWTEST_F(CommandStreamReceiverHwTest, GivenTwoRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForTwoDevicesCreated) { + 
auto executionEnvironment = std::make_unique(defaultHwInfo.get(), true, 2u); + auto devices = DeviceFactory::createDevices(*executionEnvironment.release()); + const RootDeviceIndicesContainer indices = {0u, 1u}; + auto csr = devices[0]->getDefaultEngine().commandStreamReceiver; + auto allocator = csr->createMultiRootDeviceTimestampPacketAllocator(indices); + class MockTagAllocatorBase : public TagAllocatorBase { + public: + using TagAllocatorBase::maxRootDeviceIndex; + }; + EXPECT_EQ(reinterpret_cast(allocator.get())->maxRootDeviceIndex, 1u); +} +HWTEST_F(CommandStreamReceiverHwTest, GivenFiveRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForFiveDevicesCreated) { + auto executionEnvironment = std::make_unique(defaultHwInfo.get(), true, 4u); + auto devices = DeviceFactory::createDevices(*executionEnvironment.release()); + const RootDeviceIndicesContainer indices = {0u, 1u, 2u, 3u}; + auto csr = devices[0]->getDefaultEngine().commandStreamReceiver; + auto allocator = csr->createMultiRootDeviceTimestampPacketAllocator(indices); + class MockTagAllocatorBase : public TagAllocatorBase { + public: + using TagAllocatorBase::maxRootDeviceIndex; + }; + EXPECT_EQ(reinterpret_cast(allocator.get())->maxRootDeviceIndex, 3u); +} +HWTEST_F(CommandStreamReceiverHwTest, givenMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushAdded) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + auto mockTagAllocator = std::make_unique>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + + auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr, + commandStreamReceiver, commandStreamReceiver.getTagAllocation(), nullptr, + commandStreamReceiver.getTagAllocation()->getUnderlyingBuffer(), + commandStreamReceiver.getTagAllocation()->getGpuAddress(), 0, + 0, 0, 0, 0, 0, 0, 0); + auto tag = 
mockTagAllocator->getTag(); + blitProperties.multiRootDeviceEventSync = tag; + + BlitPropertiesContainer container; + container.push_back(blitProperties); + commandStreamReceiver.flushBcsTask(container, true, false, *pDevice); + HardwareParse hwParser; + hwParser.parseCommands(commandStreamReceiver.commandStream, 0); + + auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + bool nodeAddressFound = false; + while (cmdIterator != hwParser.cmdList.end()) { + auto flush = genCmdCast(*cmdIterator); + if (flush->getDestinationAddress() == tag->getGpuAddress() + tag->getContextEndOffset()) { + nodeAddressFound = true; + break; + } + cmdIterator = find(++cmdIterator, hwParser.cmdList.end()); + } + EXPECT_TRUE(nodeAddressFound); +} +HWTEST_F(CommandStreamReceiverHwTest, givenNullPtrAsMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushNotAdded) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + auto mockTagAllocator = std::make_unique>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + + auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr, + commandStreamReceiver, commandStreamReceiver.getTagAllocation(), nullptr, + commandStreamReceiver.getTagAllocation()->getUnderlyingBuffer(), + commandStreamReceiver.getTagAllocation()->getGpuAddress(), 0, + 0, 0, 0, 0, 0, 0, 0); + auto tag = mockTagAllocator->getTag(); + + BlitPropertiesContainer container; + container.push_back(blitProperties); + commandStreamReceiver.flushBcsTask(container, true, false, *pDevice); + HardwareParse hwParser; + hwParser.parseCommands(commandStreamReceiver.commandStream, 0); + + auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + bool nodeAddressFound = false; + while (cmdIterator != hwParser.cmdList.end()) { + auto flush = genCmdCast(*cmdIterator); + if 
(flush->getDestinationAddress() == tag->getGpuAddress() + tag->getContextEndOffset()) { + nodeAddressFound = true; + break; + } + cmdIterator = find(++cmdIterator, hwParser.cmdList.end()); + } + EXPECT_FALSE(nodeAddressFound); +} \ No newline at end of file diff --git a/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp b/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp index e0df0e40ff..1575ecc7eb 100644 --- a/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp +++ b/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp @@ -15,6 +15,7 @@ #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/test_checks_shared.h" @@ -663,3 +664,25 @@ HWTEST2_F(BlitTests, givenPlatformWhenCallingDispatchPreBlitCommandThenNoneMiFlu auto cmdIterator = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); ASSERT_EQ(hwParser.cmdList.end(), cmdIterator); } + +HWTEST_F(BlitTests, givenPlatformWhenCallingDispatchPreBlitCommandThenNoneMiFlushDwIsProgramed) { + auto mockTagAllocator = std::make_unique>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u); + auto tag = mockTagAllocator->getTag(); + BlitProperties blitProperties{}; + blitProperties.copySize = {1, 1, 1}; + BlitPropertiesContainer blitPropertiesContainer1; + blitPropertiesContainer1.push_back(blitProperties); + blitPropertiesContainer1.push_back(blitProperties); + blitPropertiesContainer1.push_back(blitProperties); + + auto estimatedSizeWithoutNode = BlitCommandsHelper::estimateBlitCommandsSize( + blitPropertiesContainer1, false, true, false, pDevice->getRootDeviceEnvironment()); + blitProperties.multiRootDeviceEventSync = tag; + BlitPropertiesContainer 
blitPropertiesContainer2; + blitPropertiesContainer2.push_back(blitProperties); + blitPropertiesContainer2.push_back(blitProperties); + blitPropertiesContainer2.push_back(blitProperties); + auto estimatedSizeWithNode = BlitCommandsHelper::estimateBlitCommandsSize( + blitPropertiesContainer2, false, true, false, pDevice->getRootDeviceEnvironment()); + EXPECT_NE(estimatedSizeWithoutNode, estimatedSizeWithNode); +} \ No newline at end of file diff --git a/shared/test/unit_test/helpers/timestamp_packet_tests.cpp b/shared/test/unit_test/helpers/timestamp_packet_tests.cpp index 5cfe1675d4..6af9bae1e9 100644 --- a/shared/test/unit_test/helpers/timestamp_packet_tests.cpp +++ b/shared/test/unit_test/helpers/timestamp_packet_tests.cpp @@ -301,3 +301,35 @@ HWTEST_F(DeviceTimestampPacketTests, givenDebugFlagSetWhenCreatingTimestampPacke EXPECT_FALSE(tag->canBeReleased()); } + +using TimestampPacketHelperTests = Test; + +HWTEST_F(TimestampPacketHelperTests, givenTagNodesInMultiRootSyncContainerWhenProgramingDependensiecThenSemaforesAreProgrammed) { + StackVec buffer(4096); + LinearStream cmdStream(buffer.begin(), buffer.size()); + CsrDependencies deps; + auto mockTagAllocator = std::make_unique>(0, pDevice->getMemoryManager()); + TimestampPacketContainer container = {}; + container.add(mockTagAllocator->getTag()); + deps.multiRootTimeStampSyncContainer.push_back(&container); + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(cmdStream, deps); + EXPECT_EQ(cmdStream.getUsed(), sizeof(typename FamilyType::MI_SEMAPHORE_WAIT)); +} + +HWTEST_F(TimestampPacketHelperTests, givenEmptyContainerMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) { + StackVec buffer(4096); + LinearStream cmdStream(buffer.begin(), buffer.size()); + CsrDependencies deps; + TimestampPacketContainer container = {}; + deps.multiRootTimeStampSyncContainer.push_back(&container); + 
TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(cmdStream, deps); + EXPECT_EQ(cmdStream.getUsed(), 0u); +} + +HWTEST_F(TimestampPacketHelperTests, givenEmptyMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) { + StackVec buffer(4096); + LinearStream cmdStream(buffer.begin(), buffer.size()); + CsrDependencies deps; + TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(cmdStream, deps); + EXPECT_EQ(cmdStream.getUsed(), 0u); +} \ No newline at end of file