From b01b8ba5ac795b8950b4521643d9ed0abae429b6 Mon Sep 17 00:00:00 2001 From: Krzysztof Gibala Date: Thu, 11 Mar 2021 13:48:04 +0000 Subject: [PATCH] Use MI_SEMAPHORE_WAIT command for event synchronization Related-To: NEO-5508 Signed-off-by: Krzysztof Gibala --- opencl/source/command_queue/command_queue.cpp | 40 --- opencl/source/command_queue/command_queue.h | 2 - opencl/source/command_queue/enqueue_common.h | 58 ++-- .../command_queue/gpgpu_walker_base.inl | 3 + .../command_queue/hardware_interface_base.inl | 2 +- opencl/source/helpers/properties_helper.cpp | 26 +- opencl/source/helpers/properties_helper.h | 3 +- opencl/source/helpers/task_information.cpp | 14 +- ...and_stream_receiver_flush_task_3_tests.cpp | 297 +++++++++++++++--- .../command_stream_receiver_hw_1_tests.cpp | 4 +- .../command_stream_receiver_hw_2_tests.cpp | 14 +- .../helpers/timestamp_packet_tests.cpp | 42 +-- .../command_stream_receiver_hw_base.inl | 4 +- shared/source/command_stream/csr_deps.cpp | 4 +- shared/source/command_stream/csr_deps.h | 7 +- .../source/helpers/blit_commands_helper.cpp | 12 +- shared/source/helpers/timestamp_packet.h | 28 +- 17 files changed, 395 insertions(+), 165 deletions(-) diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 2e639f8e7f..0bde1a783c 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -585,46 +585,6 @@ bool CommandQueue::validateCapabilityForOperation(cl_command_queue_capabilities_ return operationValid && waitListValid && outEventValid; } -void CommandQueue::waitForEventsFromDifferentRootDeviceIndex(cl_uint numEventsInWaitList, const cl_event *eventWaitList, - StackVec &waitListCurrentRootDeviceIndex, bool &isEventWaitListFromPreviousRootDevice) { - isEventWaitListFromPreviousRootDevice = false; - - for (auto &rootDeviceIndex : context->getRootDeviceIndices()) { - CommandQueue *commandQueuePreviousRootDevice = nullptr; - auto 
maxTaskCountPreviousRootDevice = 0u; - - if (this->getDevice().getRootDeviceIndex() != rootDeviceIndex) { - for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) { - auto event = castToObject(eventWaitList[eventId]); - - if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() == rootDeviceIndex) { - maxTaskCountPreviousRootDevice = std::max(maxTaskCountPreviousRootDevice, event->peekTaskCount()); - commandQueuePreviousRootDevice = event->getCommandQueue(); - isEventWaitListFromPreviousRootDevice = true; - } - } - - if (maxTaskCountPreviousRootDevice) { - commandQueuePreviousRootDevice->getCommandStreamReceiver(false).waitForCompletionWithTimeout(false, 0, maxTaskCountPreviousRootDevice); - } - } - } - - if (isEventWaitListFromPreviousRootDevice) { - for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) { - auto event = castToObject(eventWaitList[eventId]); - - if (event->getCommandQueue()) { - if (event->getCommandQueue()->getDevice().getRootDeviceIndex() == this->getDevice().getRootDeviceIndex()) { - waitListCurrentRootDeviceIndex.push_back(static_cast(eventWaitList[eventId])); - } - } else { - waitListCurrentRootDeviceIndex.push_back(static_cast(eventWaitList[eventId])); - } - } - } -} - cl_uint CommandQueue::getQueueFamilyIndex() const { if (isQueueFamilySelected()) { return queueFamilyIndex; diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 2182c03340..7090c5f53f 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -303,8 +303,6 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool validateCapability(cl_command_queue_capabilities_intel capability) const; bool validateCapabilitiesForEventWaitList(cl_uint numEventsInWaitList, const cl_event *waitList) const; bool validateCapabilityForOperation(cl_command_queue_capabilities_intel capability, cl_uint numEventsInWaitList, const 
cl_event *waitList, const cl_event *outEvent) const; - void waitForEventsFromDifferentRootDeviceIndex(cl_uint numEventsInWaitList, const cl_event *eventWaitList, - StackVec &waitListCurrentRootDeviceIndex, bool &isEventWaitListFromPreviousRootDevice); cl_uint getQueueFamilyIndex() const; cl_uint getQueueIndexWithinFamily() const { return queueIndexWithinFamily; } bool isQueueFamilySelected() const { return queueFamilySelected; } diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 629f7b23aa..b40506f0e6 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -159,16 +159,6 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, return; } - StackVec waitListCurrentRootDeviceIndex; - bool isEventWaitListFromPreviousRootDevice = false; - - if (context->getRootDeviceIndices().size() > 1u) { - waitForEventsFromDifferentRootDeviceIndex(numEventsInWaitList, eventWaitList, waitListCurrentRootDeviceIndex, isEventWaitListFromPreviousRootDevice); - } - - const cl_event *eventWaitListCurrentRootDevice = isEventWaitListFromPreviousRootDevice ? waitListCurrentRootDeviceIndex.data() : eventWaitList; - cl_uint numEventsInWaitListCurrentRootDevice = isEventWaitListFromPreviousRootDevice ? 
static_cast(waitListCurrentRootDeviceIndex.size()) : numEventsInWaitList; - Kernel *parentKernel = multiDispatchInfo.peekParentKernel(); auto devQueue = this->getContext().getDefaultDeviceQueue(); DeviceQueueHw *devQueueHw = castToObject>(devQueue); @@ -187,7 +177,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, auto blockQueue = false; auto taskLevel = 0u; - obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitListCurrentRootDevice, eventWaitListCurrentRootDevice, blockQueue, commandType); + obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitList, eventWaitList, blockQueue, commandType); if (parentKernel && !blockQueue) { while (!devQueueHw->isEMCriticalSectionFree()) @@ -203,14 +193,16 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } TimestampPacketDependencies timestampPacketDependencies; - EventsRequest eventsRequest(numEventsInWaitListCurrentRootDevice, eventWaitListCurrentRootDevice, event); + EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event); CsrDependencies csrDeps; BlitPropertiesContainer blitPropertiesContainer; + eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, getGpgpuCommandStreamReceiver()); + bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo); if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { - eventsRequest.fillCsrDependencies(csrDeps, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); size_t nodesCount = 0u; @@ -227,7 +219,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, if (nodesCount > 0) { obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, false); - 
csrDeps.push_back(&timestampPacketDependencies.previousEnqueueNodes); + csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes); } } @@ -235,6 +227,8 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, blockedCommandsData, surfacesForResidency, numSurfaceForResidency); auto commandStreamStart = commandStream.getUsed(); + TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer(commandStream, csrDeps); + if (enqueueWithBlitAuxTranslation) { processDispatchForBlitAuxTranslation(multiDispatchInfo, blitPropertiesContainer, timestampPacketDependencies, eventsRequest, blockQueue); @@ -269,7 +263,10 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } } if (flushDependenciesForNonKernelCommand) { - TimestampPacketHelper::programCsrDependencies(commandStream, csrDeps, getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices()); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer( + commandStream, + csrDeps, + getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices()); } } @@ -325,10 +322,10 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, auto maxTaskCountCurrentRootDevice = this->taskCount; - for (auto eventId = 0u; eventId < numEventsInWaitListCurrentRootDevice; eventId++) { - auto event = castToObject(eventWaitListCurrentRootDevice[eventId]); + for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) { + auto event = castToObject(eventWaitList[eventId]); - if (!event->isUserEvent() && !event->isExternallySynchronized()) { + if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() == this->getDevice().getRootDeviceIndex()) { maxTaskCountCurrentRootDevice = std::max(maxTaskCountCurrentRootDevice, event->peekTaskCount()); } } @@ -467,12 +464,12 @@ BlitProperties CommandQueueHw::processDispatchForBlitEnqueue(const Mu auto blitProperties = 
ClBlitProperties::constructProperties(blitDirection, *blitCommandStreamReceiver, multiDispatchInfo.peekBuiltinOpParams()); if (!queueBlocked) { - eventsRequest.fillCsrDependencies(blitProperties.csrDependencies, *blitCommandStreamReceiver, - CsrDependencies::DependenciesType::All); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, *blitCommandStreamReceiver, + CsrDependencies::DependenciesType::All); - blitProperties.csrDependencies.push_back(&timestampPacketDependencies.cacheFlushNodes); - blitProperties.csrDependencies.push_back(&timestampPacketDependencies.previousEnqueueNodes); - blitProperties.csrDependencies.push_back(&timestampPacketDependencies.barrierNodes); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.cacheFlushNodes); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes); } auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0); @@ -537,7 +534,7 @@ void CommandQueueHw::processDispatchForBlitAuxTranslation(const Multi if (!queueBlocked) { CsrDependencies csrDeps; - eventsRequest.fillCsrDependencies(csrDeps, *getBcsCommandStreamReceiver(), CsrDependencies::DependenciesType::All); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, *getBcsCommandStreamReceiver(), CsrDependencies::DependenciesType::All); BlitProperties::setupDependenciesForAuxTranslation(blitPropertiesContainer, timestampPacketDependencies, *this->timestampPacketContainer, csrDeps, getGpgpuCommandStreamReceiver(), *getBcsCommandStreamReceiver()); @@ -550,7 +547,10 @@ void CommandQueueHw::processDispatchForCacheFlush(Surface **surfaces, LinearStream *commandStream, CsrDependencies &csrDeps) { - TimestampPacketHelper::programCsrDependencies(*commandStream, csrDeps, 
getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices()); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer( + *commandStream, + csrDeps, + getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices()); uint64_t postSyncAddress = 0; if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { @@ -813,7 +813,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode; if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { - eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr); dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver()); } @@ -1027,7 +1027,7 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( 1u); //numDevicesInContext if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { - eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr); dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver()); } @@ -1106,7 +1106,7 @@ void CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDispat BlitPropertiesContainer blitPropertiesContainer; CsrDependencies csrDeps; - eventsRequest.fillCsrDependencies(csrDeps, *getBcsCommandStreamReceiver(), CsrDependencies::DependenciesType::All); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, *getBcsCommandStreamReceiver(), CsrDependencies::DependenciesType::All); 
auto allocator = getBcsCommandStreamReceiver()->getTimestampPacketAllocator(); if (isCacheFlushForBcsRequired() && isGpgpuSubmissionForBcsRequired(blockQueue)) { @@ -1118,7 +1118,7 @@ void CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDispat } obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, true); - csrDeps.push_back(&timestampPacketDependencies.previousEnqueueNodes); + csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes); LinearStream *gpgpuCommandStream = {}; size_t gpgpuCommandStreamStart = {}; diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl index 88e258d930..33cf255b6c 100644 --- a/opencl/source/command_queue/gpgpu_walker_base.inl +++ b/opencl/source/command_queue/gpgpu_walker_base.inl @@ -219,6 +219,9 @@ size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, c if (DebugManager.flags.GpuScratchRegWriteAfterWalker.get() != -1) { expectedSizeCS += sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM); } + + expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps); + return expectedSizeCS; } diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index 4dd685cb7b..7ef0bf4071 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -107,7 +107,7 @@ void HardwareInterface::dispatchWalker( } auto numSupportedDevices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices(); - TimestampPacketHelper::programCsrDependencies(*commandStream, csrDependencies, numSupportedDevices); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(*commandStream, csrDependencies, numSupportedDevices); dsh->align(EncodeStates::alignInterfaceDescriptorData); diff --git 
a/opencl/source/helpers/properties_helper.cpp b/opencl/source/helpers/properties_helper.cpp index c5f3c3ec80..64db1a847f 100644 --- a/opencl/source/helpers/properties_helper.cpp +++ b/opencl/source/helpers/properties_helper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -17,7 +17,7 @@ namespace NEO { -void EventsRequest::fillCsrDependencies(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const { +void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const { for (cl_uint i = 0; i < this->numEventsInWaitList; i++) { auto event = castToObjectOrAbort(this->eventWaitList[i]); if (event->isUserEvent()) { @@ -35,7 +35,26 @@ void EventsRequest::fillCsrDependencies(CsrDependencies &csrDeps, CommandStreamR (CsrDependencies::DependenciesType::All == depsType); if (pushDependency) { - csrDeps.push_back(timestampPacketContainer); + csrDeps.timestampPacketContainer.push_back(timestampPacketContainer); + } + } +} + +void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const { + for (cl_uint i = 0; i < this->numEventsInWaitList; i++) { + auto event = castToObjectOrAbort(this->eventWaitList[i]); + if (event->isUserEvent()) { + continue; + } + + if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) { + auto taskCountPreviousRootDevice = event->peekTaskCount(); + auto tagAddressPreviousRootDevice = event->getCommandQueue()->getCommandStreamReceiver(false).getTagAddress(); + + csrDeps.taskCountContainer.push_back({taskCountPreviousRootDevice, reinterpret_cast(tagAddressPreviousRootDevice)}); + + auto graphicsAllocation = 
event->getCommandQueue()->getCommandStreamReceiver(false).getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex()); + currentCsr.getResidencyAllocations().push_back(graphicsAllocation); } } } @@ -43,7 +62,6 @@ void EventsRequest::fillCsrDependencies(CsrDependencies &csrDeps, CommandStreamR TransferProperties::TransferProperties(MemObj *memObj, cl_command_type cmdType, cl_map_flags mapFlags, bool blocking, size_t *offsetPtr, size_t *sizePtr, void *ptr, bool doTransferOnCpu, uint32_t rootDeviceIndex) : memObj(memObj), ptr(ptr), cmdType(cmdType), mapFlags(mapFlags), blocking(blocking), doTransferOnCpu(doTransferOnCpu) { - // no size or offset passed for unmap operation if (cmdType != CL_COMMAND_UNMAP_MEM_OBJECT) { if (memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) { diff --git a/opencl/source/helpers/properties_helper.h b/opencl/source/helpers/properties_helper.h index d1534dc752..11d84d2b60 100644 --- a/opencl/source/helpers/properties_helper.h +++ b/opencl/source/helpers/properties_helper.h @@ -24,7 +24,8 @@ struct EventsRequest { EventsRequest(cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *outEvent) : numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), outEvent(outEvent) {} - void fillCsrDependencies(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const; + void fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const; + void fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const; cl_uint numEventsInWaitList; const cl_event *eventWaitList; diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index ab21b23e29..77d5030497 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -205,7 +205,7 @@ CompletionStamp 
&CommandComputeKernel::submit(uint32_t taskLevel, bool terminate if (kernelOperation->blitPropertiesContainer.size() > 0) { auto &bcsCsr = *commandQueue.getBcsCommandStreamReceiver(); CsrDependencies csrDeps; - eventsRequest.fillCsrDependencies(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All); BlitProperties::setupDependenciesForAuxTranslation(kernelOperation->blitPropertiesContainer, *timestampPacketDependencies, *currentTimestampPacketNodes, csrDeps, @@ -246,7 +246,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate kernel->areMultipleSubDevicesInContext()); //areMultipleSubDevicesInContext if (timestampPacketDependencies) { - eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr); dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies->barrierNodes; } dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = kernel->requiresSpecialPipelineSelectMode(); @@ -303,10 +303,10 @@ void CommandWithoutKernel::dispatchBlitOperation() { UNRECOVERABLE_IF(kernelOperation->blitPropertiesContainer.size() != 1); auto &blitProperties = *kernelOperation->blitPropertiesContainer.begin(); - eventsRequest.fillCsrDependencies(blitProperties.csrDependencies, *bcsCsr, CsrDependencies::DependenciesType::All); - blitProperties.csrDependencies.push_back(&timestampPacketDependencies->cacheFlushNodes); - blitProperties.csrDependencies.push_back(&timestampPacketDependencies->previousEnqueueNodes); - blitProperties.csrDependencies.push_back(&timestampPacketDependencies->barrierNodes); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, *bcsCsr, 
CsrDependencies::DependenciesType::All); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->cacheFlushNodes); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->previousEnqueueNodes); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->barrierNodes); blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0]; auto bcsTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled()); @@ -372,7 +372,7 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate UNRECOVERABLE_IF(!kernelOperation->blitEnqueue && !commandStreamReceiver.peekTimestampPacketWriteEnabled()); - eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr); makeTimestampPacketsResident(commandStreamReceiver); gtpinNotifyPreFlushTask(&commandQueue); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp index 2e0af0a513..c201fd0278 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp @@ -29,6 +29,8 @@ #include "opencl/test/unit_test/mocks/mock_platform.h" #include "opencl/test/unit_test/mocks/mock_program.h" #include "opencl/test/unit_test/mocks/mock_submissions_aggregator.h" +#include "opencl/test/unit_test/mocks/mock_svm_manager.h" +#include "opencl/test/unit_test/test_macros/test_checks_ocl.h" #include "test.h" using namespace NEO; @@ -1944,7 +1946,11 @@ 
HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenGpuIsIdleWhenCsrIsEnabledToFl *commandStreamReceiver.getTagAddress() = 2u; } -TEST(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToMarkerThenCsrsAreWaitingForEventsFromPreviousDevices) { +using MultiRootDeviceCommandStreamReceiverTests = CommandStreamReceiverFlushTaskTests; + +HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToEnqueueWithoutSubmissionThenCsIsWaitingForEventsFromPreviousDevices) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + auto deviceFactory = std::make_unique(4, 0); auto device1 = deviceFactory->rootDevices[1]; auto device2 = deviceFactory->rootDevices[2]; @@ -1968,11 +1974,10 @@ TEST(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDev Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); - Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6); - Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); - Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); - Event event6(pCmdQ3, CL_COMMAND_NDRANGE_KERNEL, 7, 21); - Event event7(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + Event event5(pCmdQ3, CL_COMMAND_NDRANGE_KERNEL, 7, 21); + Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); UserEvent userEvent1(&pCmdQ1->getContext()); UserEvent userEvent2(&pCmdQ2->getContext()); @@ -1987,42 +1992,34 @@ TEST(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDev &event4, &event5, &event6, - &event7, &userEvent1, &userEvent2, }; - cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); { - cl_event eventWaitList[] = - { - &event1, - &event3, - &event4, - }; - - cl_uint numEventsInWaitList = sizeof(eventWaitList) / 
sizeof(eventWaitList[0]); - pCmdQ1->enqueueMarkerWithWaitList( numEventsInWaitList, eventWaitList, nullptr); - EXPECT_EQ(0u, mockCsr1->waitForCompletionWithTimeoutCalled); - EXPECT_EQ(0u, mockCsr2->waitForCompletionWithTimeoutCalled); - EXPECT_EQ(0u, mockCsr3->waitForCompletionWithTimeoutCalled); - } + HardwareParse csHwParser; + csHwParser.parseCommands(pCmdQ1->getCS(0)); + auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); - { - pCmdQ1->enqueueMarkerWithWaitList( - numEventsInWaitList, - eventWaitList, - nullptr); + EXPECT_EQ(3u, semaphores.size()); - EXPECT_EQ(0u, mockCsr1->waitForCompletionWithTimeoutCalled); - EXPECT_EQ(1u, mockCsr2->waitForCompletionWithTimeoutCalled); - EXPECT_EQ(1u, mockCsr3->waitForCompletionWithTimeoutCalled); + auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); + EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ2->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + + auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); + EXPECT_EQ(21u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ3->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + + auto semaphoreCmd2 = genCmdCast(*(semaphores[2])); + EXPECT_EQ(7u, semaphoreCmd2->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ2->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress()); } { @@ -2031,20 +2028,250 @@ TEST(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDev eventWaitList, nullptr); - EXPECT_EQ(1u, mockCsr1->waitForCompletionWithTimeoutCalled); - EXPECT_EQ(1u, mockCsr2->waitForCompletionWithTimeoutCalled); - EXPECT_EQ(2u, mockCsr3->waitForCompletionWithTimeoutCalled); + HardwareParse csHwParser; + csHwParser.parseCommands(pCmdQ2->getCS(0)); + auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); + + 
EXPECT_EQ(3u, semaphores.size()); + + auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); + EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + + auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); + EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + + auto semaphoreCmd2 = genCmdCast(*(semaphores[2])); + EXPECT_EQ(21u, semaphoreCmd2->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ3->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress()); } { + cl_event eventWaitList[] = + { + &event1, + &event2, + &event5, + &userEvent1, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + pCmdQ3->enqueueMarkerWithWaitList( numEventsInWaitList, eventWaitList, nullptr); - EXPECT_EQ(2u, mockCsr1->waitForCompletionWithTimeoutCalled); - EXPECT_EQ(2u, mockCsr2->waitForCompletionWithTimeoutCalled); - EXPECT_EQ(2u, mockCsr3->waitForCompletionWithTimeoutCalled); + HardwareParse csHwParser; + csHwParser.parseCommands(pCmdQ3->getCS(0)); + auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); + + EXPECT_EQ(1u, semaphores.size()); + + auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); + EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + } +} + +using MultiRootDeviceCommandStreamReceiverBufferTests = MultiRootDeviceFixture; + +HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToEnqueueWithSubmissionThenCsIsWaitingForEventsFromPreviousDevices) { + REQUIRE_SVM_OR_SKIP(device1); + 
REQUIRE_SVM_OR_SKIP(device2); + + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + cl_int retVal = 0; + size_t offset = 0; + size_t size = 1; + + auto pCmdQ1 = context.get()->getSpecialQueue(1u); + auto pCmdQ2 = context.get()->getSpecialQueue(2u); + + std::unique_ptr program(Program::createBuiltInFromSource("FillBufferBytes", context.get(), context.get()->getDevices(), &retVal)); + program->build(program->getDevices(), nullptr, false); + std::unique_ptr kernel(Kernel::create(program.get(), program->getKernelInfoForKernel("FillBufferBytes"), *context.get()->getDevice(0), &retVal)); + + size_t svmSize = 4096; + void *svmPtr = alignedMalloc(svmSize, MemoryConstants::pageSize); + MockGraphicsAllocation svmAlloc(svmPtr, svmSize); + + Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); + Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + UserEvent userEvent1(&pCmdQ1->getContext()); + UserEvent userEvent2(&pCmdQ2->getContext()); + + userEvent1.setStatus(CL_COMPLETE); + userEvent2.setStatus(CL_COMPLETE); + + cl_event eventWaitList[] = + { + &event1, + &event2, + &event3, + &event4, + &event5, + &userEvent1, + &userEvent2, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + + { + kernel->setSvmKernelExecInfo(&svmAlloc); + + retVal = pCmdQ1->enqueueKernel( + kernel.get(), + 1, + &offset, + &size, + &size, + numEventsInWaitList, + eventWaitList, + nullptr); + + HardwareParse csHwParser; + csHwParser.parseCommands(pCmdQ1->getCS(0)); + auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); + + EXPECT_EQ(2u, semaphores.size()); + + auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); + EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword()); + 
EXPECT_EQ(reinterpret_cast(pCmdQ2->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + + auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); + EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ2->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + } + + { + kernel->setSvmKernelExecInfo(&svmAlloc); + + retVal = pCmdQ2->enqueueKernel( + kernel.get(), + 1, + &offset, + &size, + &size, + numEventsInWaitList, + eventWaitList, + nullptr); + + HardwareParse csHwParser; + csHwParser.parseCommands(pCmdQ2->getCS(0)); + auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); + + EXPECT_EQ(2u, semaphores.size()); + + auto semaphoreCmd0 = genCmdCast(*(semaphores[0])); + EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress()); + + auto semaphoreCmd1 = genCmdCast(*(semaphores[1])); + EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword()); + EXPECT_EQ(reinterpret_cast(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress()); + } + alignedFree(svmPtr); +} + +HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToMarkerThenMiSemaphoreWaitCommandSizeIsIncluded) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto deviceFactory = std::make_unique(3, 0); + auto device1 = deviceFactory->rootDevices[1]; + auto device2 = deviceFactory->rootDevices[2]; + + auto mockCsr1 = new MockCommandStreamReceiver(*device1->executionEnvironment, device1->getRootDeviceIndex(), device1->getDeviceBitfield()); + auto mockCsr2 = new MockCommandStreamReceiver(*device2->executionEnvironment, device2->getRootDeviceIndex(), device2->getDeviceBitfield()); + + 
device1->resetCommandStreamReceiver(mockCsr1); + device2->resetCommandStreamReceiver(mockCsr2); + + cl_device_id devices[] = {device1, device2}; + + auto context = std::make_unique(ClDeviceVector(devices, 2), false); + + auto pCmdQ1 = context.get()->getSpecialQueue(1u); + auto pCmdQ2 = context.get()->getSpecialQueue(2u); + + MockKernelWithInternals mockKernel(ClDeviceVector(devices, 2)); + DispatchInfo dispatchInfo; + MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel); + dispatchInfo.setKernel(mockKernel.mockKernel); + multiDispatchInfo.push(dispatchInfo); + + Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15); + Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16); + Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6); + Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20); + Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4); + Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7); + UserEvent userEvent1(&pCmdQ1->getContext()); + UserEvent userEvent2(&pCmdQ2->getContext()); + + userEvent1.setStatus(CL_COMPLETE); + userEvent2.setStatus(CL_COMPLETE); + + { + cl_event eventWaitList[] = + { + &event1, + &event2, + &event3, + &event4, + &userEvent1, + &userEvent2, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + + pCmdQ1->enqueueMarkerWithWaitList( + numEventsInWaitList, + eventWaitList, + nullptr); + + EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); + CsrDependencies csrDeps; + eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ1->getCommandStreamReceiver(false)); + + HardwareParse csHwParser; + csHwParser.parseCommands(pCmdQ1->getCS(0)); + auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); + + EXPECT_EQ(0u, semaphores.size()); + EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps)); + } + + { + cl_event eventWaitList[] = + { + &event1, + &event2, + &event3, + &event4, + &event5, + &event6, + 
&userEvent1, + }; + cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]); + + pCmdQ2->enqueueMarkerWithWaitList( + numEventsInWaitList, + eventWaitList, + nullptr); + + EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr); + CsrDependencies csrDeps; + eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ2->getCommandStreamReceiver(false)); + + HardwareParse csHwParser; + csHwParser.parseCommands(pCmdQ2->getCS(0)); + auto semaphores = findAll(csHwParser.cmdList.begin(), csHwParser.cmdList.end()); + + EXPECT_EQ(3u, semaphores.size()); + EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer(csrDeps)); } } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp index 6c5b30ea91..2a0c0e0d4b 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp @@ -804,8 +804,8 @@ HWTEST_F(BcsTests, givenBltSizeAndCsrDependenciesWhenEstimatingCommandSizeThenAd MockTimestampPacketContainer timestamp0(*csr.getTimestampPacketAllocator(), numberNodesPerContainer); MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), numberNodesPerContainer); - csrDependencies.push_back(&timestamp0); - csrDependencies.push_back(&timestamp1); + csrDependencies.timestampPacketContainer.push_back(&timestamp0); + csrDependencies.timestampPacketContainer.push_back(&timestamp1); size_t cmdsSizePerBlit = sizeof(typename FamilyType::XY_COPY_BLT) + sizeof(typename FamilyType::MI_ARB_CHECK); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp index 3bd9f42ba4..654d0ed4c0 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp +++
b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp @@ -220,8 +220,8 @@ HWTEST_F(BcsTests, givenCsrDependenciesWhenProgrammingCommandStreamThenAddSemaph MockTimestampPacketContainer timestamp0(*csr.getTimestampPacketAllocator(), numberNodesPerContainer); MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), numberNodesPerContainer); - blitProperties.csrDependencies.push_back(&timestamp0); - blitProperties.csrDependencies.push_back(&timestamp1); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestamp0); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestamp1); blitBuffer(&csr, blitProperties, true); @@ -278,8 +278,8 @@ HWTEST_F(BcsTests, givenMultipleBlitPropertiesWhenDispatchingThenProgramCommands MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), 1); MockTimestampPacketContainer timestamp2(*csr.getTimestampPacketAllocator(), 1); - blitProperties1.csrDependencies.push_back(&timestamp1); - blitProperties2.csrDependencies.push_back(&timestamp2); + blitProperties1.csrDependencies.timestampPacketContainer.push_back(&timestamp1); + blitProperties2.csrDependencies.timestampPacketContainer.push_back(&timestamp2); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties1); @@ -1248,8 +1248,8 @@ HWTEST_F(BcsTests, givenBlitterDirectSubmissionEnabledWhenProgrammingBlitterThen MockTimestampPacketContainer timestamp0(*csr.getTimestampPacketAllocator(), numberNodesPerContainer); MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), numberNodesPerContainer); - blitProperties.csrDependencies.push_back(&timestamp0); - blitProperties.csrDependencies.push_back(&timestamp1); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestamp0); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestamp1); blitBuffer(&csr, blitProperties, true); @@ -1564,4 +1564,4 @@ TEST(BcsConstantsTests,
givenBlitConstantsThenTheyHaveDesiredValues) { EXPECT_EQ(BlitterConstants::maxBlitHeight, 0x4000u); EXPECT_EQ(BlitterConstants::maxBlitSetWidth, 0x1FF80u); EXPECT_EQ(BlitterConstants::maxBlitSetHeight, 0x1FFC0u); -} \ No newline at end of file +} diff --git a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp index 603a61658c..882b45db87 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp @@ -449,14 +449,14 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat EventsRequest eventsRequest(numEventsOnWaitlist, waitlist, nullptr); CsrDependencies csrDeps; - eventsRequest.fillCsrDependencies( + eventsRequest.fillCsrDependenciesForTimestampPacketContainer( csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0); auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; - for (auto timestampPacketContainer : csrDeps) { + for (auto timestampPacketContainer : csrDeps.timestampPacketContainer) { for (auto &node : timestampPacketContainer->peekNodes()) { sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency(*node); } @@ -499,13 +499,13 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr EventsRequest eventsRequest(numEventsOnWaitlist, waitlist, nullptr); CsrDependencies csrDeps; - eventsRequest.fillCsrDependencies(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0); auto sizeWithEnabled = 
mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; - for (auto timestampPacketContainer : csrDeps) { + for (auto timestampPacketContainer : csrDeps.timestampPacketContainer) { for (auto &node : timestampPacketContainer->peekNodes()) { sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency(*node); } @@ -531,8 +531,8 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWithEventsWithoutTimestampsWhen EventsRequest eventsRequest(numEventsOnWaitlist, waitlist, nullptr); CsrDependencies csrDepsEmpty; - eventsRequest.fillCsrDependencies(csrDepsEmpty, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - EXPECT_EQ(0u, csrDepsEmpty.size()); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDepsEmpty, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); + EXPECT_EQ(0u, csrDepsEmpty.timestampPacketContainer.size()); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; MockTimestampPacketContainer timestamp1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); @@ -559,12 +559,12 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWithEventsWithoutTimestampsWhen cl_event waitlist2[] = {&event1, &eventWithEmptyTimestampContainer2, &event3, &eventWithEmptyTimestampContainer4, &event5}; EventsRequest eventsRequest2(numEventsOnWaitlist, waitlist2, nullptr); CsrDependencies csrDepsSize3; - eventsRequest2.fillCsrDependencies(csrDepsSize3, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); + eventsRequest2.fillCsrDependenciesForTimestampPacketContainer(csrDepsSize3, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - EXPECT_EQ(3u, csrDepsSize3.size()); + EXPECT_EQ(3u, csrDepsSize3.timestampPacketContainer.size()); size_t sizeForNodeDependency = 0; - for (auto timestampPacketContainer : csrDepsSize3) { + for (auto timestampPacketContainer : 
csrDepsSize3.timestampPacketContainer) { for (auto &node : timestampPacketContainer->peekNodes()) { sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency(*node); } @@ -794,11 +794,11 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWhenEstimatingStreamSizeForCsrT auto sizeWithoutEvents = csr.getRequiredCmdStreamSize(flags, device->getDevice()); - eventsRequest.fillCsrDependencies(flags.csrDependencies, csr, NEO::CsrDependencies::DependenciesType::OutOfCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(flags.csrDependencies, csr, NEO::CsrDependencies::DependenciesType::OutOfCsr); auto sizeWithEvents = csr.getRequiredCmdStreamSize(flags, device->getDevice()); size_t sizeForNodeDependency = 0; - for (auto timestampPacketContainer : flags.csrDependencies) { + for (auto timestampPacketContainer : flags.csrDependencies.timestampPacketContainer) { for (auto &node : timestampPacketContainer->peekNodes()) { sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency(*node); } @@ -842,11 +842,11 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWhenEstimatingStreamSizeForDiff auto sizeWithoutEvents = csr.getRequiredCmdStreamSize(flags, device->getDevice()); - eventsRequest.fillCsrDependencies(flags.csrDependencies, csr, NEO::CsrDependencies::DependenciesType::OutOfCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(flags.csrDependencies, csr, NEO::CsrDependencies::DependenciesType::OutOfCsr); auto sizeWithEvents = csr.getRequiredCmdStreamSize(flags, device->getDevice()); size_t sizeForNodeDependency = 0; - for (auto timestampPacketContainer : flags.csrDependencies) { + for (auto timestampPacketContainer : flags.csrDependencies.timestampPacketContainer) { for (auto &node : timestampPacketContainer->peekNodes()) { sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency(*node); } @@ -991,8 +991,8 @@ HWTEST_F(TimestampPacketTests, 
givenAllDependencyTypesModeWhenFillingFromDiffere EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr); CsrDependencies csrDependencies; - eventsRequest.fillCsrDependencies(csrDependencies, csr1, CsrDependencies::DependenciesType::All); - EXPECT_EQ(static_cast(eventsOnWaitlist), csrDependencies.size()); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDependencies, csr1, CsrDependencies::DependenciesType::All); + EXPECT_EQ(static_cast(eventsOnWaitlist), csrDependencies.timestampPacketContainer.size()); } HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledOnDifferentCSRsFromOneDeviceWhenEnqueueingThenProgramSemaphoresOnCsrStream) { @@ -1177,7 +1177,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenDispatchingTh EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr); CsrDependencies csrDeps; - eventsRequest.fillCsrDependencies(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); HardwareInterface::dispatchWalker( *mockCmdQ, @@ -1260,7 +1260,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledOnDifferentCSRsFr EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr); CsrDependencies csrDeps; - eventsRequest.fillCsrDependencies(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); HardwareInterface::dispatchWalker( *mockCmdQ, @@ -1769,12 +1769,12 @@ HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingMarkerWi verifySemaphore(genCmdCast(*(csrSemaphores[0])), node2.getNode(0), 0); auto queueSemaphores = findAll(hwParserCmdQ.cmdList.begin(), 
hwParserCmdQ.cmdList.end()); - auto expectedQueueSemaphoresCount = 1u; + auto expectedQueueSemaphoresCount = 2u; if (UnitTestHelper::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) { expectedQueueSemaphoresCount += 2; } EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size()); - verifySemaphore(genCmdCast(*(queueSemaphores[0])), node1.getNode(0), 0); + verifySemaphore(genCmdCast(*(queueSemaphores[1])), node1.getNode(0), 0); } HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingBarrierWithoutKernelThenInheritTimestampPacketsAndProgramSemaphores) { @@ -1812,12 +1812,12 @@ HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingBarrierW verifySemaphore(genCmdCast(*(csrSemaphores[0])), node2.getNode(0), 0); auto queueSemaphores = findAll(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end()); - auto expectedQueueSemaphoresCount = 1u; + auto expectedQueueSemaphoresCount = 2u; if (UnitTestHelper::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) { expectedQueueSemaphoresCount += 2; } EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size()); - verifySemaphore(genCmdCast(*(queueSemaphores[0])), node1.getNode(0), 0); + verifySemaphore(genCmdCast(*(queueSemaphores[1])), node1.getNode(0), 0); } HWTEST_F(TimestampPacketTests, givenEmptyWaitlistAndNoOutputEventWhenEnqueueingMarkerThenDoNothing) { diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 3bf43070d7..082e4ce127 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -327,7 +327,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags, device)); auto commandStreamStartCSR = commandStreamCSR.getUsed(); - TimestampPacketHelper::programCsrDependencies(commandStreamCSR, 
dispatchFlags.csrDependencies, getOsContext().getNumSupportedDevices()); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStreamCSR, dispatchFlags.csrDependencies, getOsContext().getNumSupportedDevices()); if (stallingPipeControlOnNextFlushRequired) { programStallingPipeControlForBarrier(commandStreamCSR, dispatchFlags); @@ -1016,7 +1016,7 @@ uint32_t CommandStreamReceiverHw::blitBuffer(const BlitPropertiesCont programEnginePrologue(commandStream); for (auto &blitProperties : blitPropertiesContainer) { - TimestampPacketHelper::programCsrDependencies(commandStream, blitProperties.csrDependencies, getOsContext().getNumSupportedDevices()); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, blitProperties.csrDependencies, getOsContext().getNumSupportedDevices()); if (blitProperties.outputTimestampPacket && profilingEnabled) { BlitCommandsHelper::encodeProfilingStartMmios(commandStream, *blitProperties.outputTimestampPacket); diff --git a/shared/source/command_stream/csr_deps.cpp b/shared/source/command_stream/csr_deps.cpp index 4d72e5b8aa..e1bee606b8 100644 --- a/shared/source/command_stream/csr_deps.cpp +++ b/shared/source/command_stream/csr_deps.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -12,7 +12,7 @@ namespace NEO { void CsrDependencies::makeResident(CommandStreamReceiver &commandStreamReceiver) const { - for (auto &timestampPacketContainer : *this) { + for (auto &timestampPacketContainer : timestampPacketContainer) { timestampPacketContainer->makeResident(commandStreamReceiver); } } diff --git a/shared/source/command_stream/csr_deps.h b/shared/source/command_stream/csr_deps.h index 8bbb3b1d22..4e82eb36b2 100644 --- a/shared/source/command_stream/csr_deps.h +++ b/shared/source/command_stream/csr_deps.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021
Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,7 +13,7 @@ namespace NEO { class TimestampPacketContainer; class CommandStreamReceiver; -class CsrDependencies : public StackVec { +class CsrDependencies { public: enum class DependenciesType { OnCsr, @@ -21,6 +21,9 @@ class CsrDependencies : public StackVec { All }; + StackVec, 32> taskCountContainer; + StackVec timestampPacketContainer; + void makeResident(CommandStreamReceiver &commandStreamReceiver) const; }; } // namespace NEO diff --git a/shared/source/helpers/blit_commands_helper.cpp b/shared/source/helpers/blit_commands_helper.cpp index 5172a7c614..6f22666da8 100644 --- a/shared/source/helpers/blit_commands_helper.cpp +++ b/shared/source/helpers/blit_commands_helper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -144,15 +144,15 @@ void BlitProperties::setupDependenciesForAuxTranslation(BlitPropertiesContainer timestampPacketDependencies.barrierNodes.add(nodesAllocator->getTag()); // wait for barrier and events before AuxToNonAux - blitPropertiesContainer[0].csrDependencies.push_back(&timestampPacketDependencies.barrierNodes); + blitPropertiesContainer[0].csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes); - for (auto dep : depsFromEvents) { - blitPropertiesContainer[0].csrDependencies.push_back(dep); + for (auto dep : depsFromEvents.timestampPacketContainer) { + blitPropertiesContainer[0].csrDependencies.timestampPacketContainer.push_back(dep); } // wait for NDR before NonAuxToAux - blitPropertiesContainer[numObjects].csrDependencies.push_back(&timestampPacketDependencies.cacheFlushNodes); - blitPropertiesContainer[numObjects].csrDependencies.push_back(&kernelTimestamps); + blitPropertiesContainer[numObjects].csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.cacheFlushNodes); +
blitPropertiesContainer[numObjects].csrDependencies.timestampPacketContainer.push_back(&kernelTimestamps); } } // namespace NEO diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index 895335b6de..4536879839 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -6,8 +6,8 @@ */ #pragma once - #include "shared/source/command_container/command_encoder.h" +#include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/command_stream/csr_deps.h" #include "shared/source/helpers/aux_translation.h" #include "shared/source/helpers/hw_helper.h" @@ -183,14 +183,29 @@ struct TimestampPacketHelper { } template - static void programCsrDependencies(LinearStream &cmdStream, const CsrDependencies &csrDependencies, uint32_t numSupportedDevices) { - for (auto timestampPacketContainer : csrDependencies) { + static void programCsrDependenciesForTimestampPacketContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies, uint32_t numSupportedDevices) { + for (auto timestampPacketContainer : csrDependencies.timestampPacketContainer) { for (auto &node : timestampPacketContainer->peekNodes()) { TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, *node, numSupportedDevices); } } } + template + static void programCsrDependenciesForForTaskCountContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) { + auto taskCountContainer = csrDependencies.taskCountContainer; + + for (auto &[taskCountPreviousRootDevice, tagAddressPreviousRootDevice] : taskCountContainer) { + using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; + using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; + + EncodeSempahore::addMiSemaphoreWaitCommand(cmdStream, + static_cast(tagAddressPreviousRootDevice), + taskCountPreviousRootDevice, + COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + } + } 
+ template static void programSemaphoreWithImplicitDependencyForAuxTranslation(LinearStream &cmdStream, const TimestampPacketDependencies *timestampPacketDependencies, @@ -241,7 +256,7 @@ struct TimestampPacketHelper { template static size_t getRequiredCmdStreamSize(const CsrDependencies &csrDependencies) { size_t totalCommandsSize = 0; - for (auto timestampPacketContainer : csrDependencies) { + for (auto timestampPacketContainer : csrDependencies.timestampPacketContainer) { for (auto &node : timestampPacketContainer->peekNodes()) { totalCommandsSize += getRequiredCmdStreamSizeForNodeDependency(*node); } @@ -249,6 +264,11 @@ struct TimestampPacketHelper { return totalCommandsSize; } + + template + static size_t getRequiredCmdStreamSizeForTaskCountContainer(const CsrDependencies &csrDependencies) { + return csrDependencies.taskCountContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); + } }; } // namespace NEO