From cb09e50e61d23d6c9395a146bc2af0162ecbc870 Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Thu, 14 May 2020 14:33:03 +0200 Subject: [PATCH] Fix CPU dependency handling for TimestampPacket Change-Id: Ia75f4ea7eea10ca84ffa4b3d92d98942804be8d0 Signed-off-by: Bartosz Dunajski --- .../built_ins/aux_translation_builtin.h | 2 +- opencl/source/command_queue/enqueue_common.h | 7 +- .../command_queue/hardware_interface_base.inl | 7 +- opencl/source/helpers/dispatch_info.h | 2 +- .../helpers/timestamp_packet_tests.cpp | 66 ++++++++++++++++++- .../command_stream_receiver_hw_base.inl | 4 +- shared/source/helpers/timestamp_packet.h | 14 ++-- 7 files changed, 83 insertions(+), 19 deletions(-) diff --git a/opencl/source/built_ins/aux_translation_builtin.h b/opencl/source/built_ins/aux_translation_builtin.h index af450ac1be..6a3403fcdb 100644 --- a/opencl/source/built_ins/aux_translation_builtin.h +++ b/opencl/source/built_ins/aux_translation_builtin.h @@ -66,7 +66,7 @@ class BuiltInOp : public BuiltinDispatchInfoBuilder using RegisteredMethodDispatcherT = RegisteredMethodDispatcher; template - static void dispatchPipeControl(LinearStream &linearStream, TimestampPacketDependencies *, const HardwareInfo &) { + static void dispatchPipeControl(LinearStream &linearStream, TimestampPacketDependencies *, const HardwareInfo &, uint32_t) { PipeControlArgs args(dcFlush); MemorySynchronizationCommands::addPipeControl(linearStream, args); } diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 9a0eb45a38..f19b15b8c3 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -267,7 +267,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } } if (flushDependenciesForNonKernelCommand) { - TimestampPacketHelper::programCsrDependencies(commandStream, csrDeps); + TimestampPacketHelper::programCsrDependencies(commandStream, csrDeps, 
getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices()); } } @@ -498,7 +498,8 @@ BlitProperties CommandQueueHw::processDispatchForBlitEnqueue(const Mu args); } - TimestampPacketHelper::programSemaphoreWithImplicitDependency(commandStream, *currentTimestampPacketNode); + TimestampPacketHelper::programSemaphoreWithImplicitDependency(commandStream, *currentTimestampPacketNode, + getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices()); return blitProperties; } @@ -547,7 +548,7 @@ void CommandQueueHw::processDispatchForCacheFlush(Surface **surfaces, LinearStream *commandStream, CsrDependencies &csrDeps) { - TimestampPacketHelper::programCsrDependencies(*commandStream, csrDeps); + TimestampPacketHelper::programCsrDependencies(*commandStream, csrDeps, getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices()); uint64_t postSyncAddress = 0; if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index dfa585ae9d..d5028024d6 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -58,7 +58,8 @@ void HardwareInterface::dispatchWalker( commandStream = &commandQueue.getCS(0); } - TimestampPacketHelper::programCsrDependencies(*commandStream, csrDependencies); + auto numSupportedDevices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices(); + TimestampPacketHelper::programCsrDependencies(*commandStream, csrDependencies, numSupportedDevices); dsh->align(HardwareCommandsHelper::alignInterfaceDescriptorData); @@ -82,7 +83,7 @@ void HardwareInterface::dispatchWalker( size_t currentDispatchIndex = 0; for (auto &dispatchInfo : multiDispatchInfo) { - dispatchInfo.dispatchInitCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo()); + 
dispatchInfo.dispatchInitCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo(), numSupportedDevices); bool isMainKernel = (dispatchInfo.getKernel() == mainKernel); dispatchKernelCommands(commandQueue, dispatchInfo, commandType, *commandStream, isMainKernel, @@ -90,7 +91,7 @@ void HardwareInterface::dispatchWalker( offsetInterfaceDescriptorTable, *dsh, *ioh, *ssh); currentDispatchIndex++; - dispatchInfo.dispatchEpilogueCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo()); + dispatchInfo.dispatchEpilogueCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo(), numSupportedDevices); } if (mainKernel->requiresCacheFlushCommand(commandQueue)) { uint64_t postSyncAddress = 0; diff --git a/opencl/source/helpers/dispatch_info.h b/opencl/source/helpers/dispatch_info.h index ffd1dd8d39..24fb5c5778 100644 --- a/opencl/source/helpers/dispatch_info.h +++ b/opencl/source/helpers/dispatch_info.h @@ -26,7 +26,7 @@ struct TimestampPacketDependencies; class DispatchInfo { public: - using DispatchCommandMethodT = void(LinearStream &commandStream, TimestampPacketDependencies *timestampPacketDependencies, const HardwareInfo &); + using DispatchCommandMethodT = void(LinearStream &commandStream, TimestampPacketDependencies *timestampPacketDependencies, const HardwareInfo &, uint32_t); using EstimateCommandsMethodT = size_t(size_t, const HardwareInfo &, bool); DispatchInfo() = default; diff --git a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp index 7a77fe26f4..f6f0adecf7 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp @@ -120,7 +120,7 @@ HWTEST_F(TimestampPacketTests, givenTagNodeWhenSemaphoreAndAtomicAreProgrammedTh mockNode.gpuAddress = 0x1230000; auto &cmdStream = mockCmdQ->getCS(0); - 
TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, mockNode); + TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, mockNode, 1); HardwareParse hwParser; hwParser.parseCommands(cmdStream, 0); @@ -141,7 +141,7 @@ HWTEST_F(TimestampPacketTests, givenDebugModeWhereAtomicsAreNotEmittedWhenComman mockNode.gpuAddress = 0x1230000; auto &cmdStream = mockCmdQ->getCS(0); - TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, mockNode); + TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, mockNode, 1); HardwareParse hwParser; hwParser.parseCommands(cmdStream, 0); @@ -155,6 +155,20 @@ HWTEST_F(TimestampPacketTests, givenDebugModeWhereAtomicsAreNotEmittedWhenComman EXPECT_FALSE(tag.isCompleted()); } +HWTEST_F(TimestampPacketTests, givenMultipleDeviesWhenIncrementingCpuDependenciesThenIncrementMultipleTimes) { + TimestampPacketStorage tag; + MockTagNode mockNode; + mockNode.tagForCpuAccess = &tag; + mockNode.gpuAddress = 0x1230000; + auto &cmdStream = mockCmdQ->getCS(0); + + const uint32_t numDevices = 3; + + TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, mockNode, numDevices); + + EXPECT_EQ(numDevices, mockNode.getImplicitCpuDependenciesCount()); +} + HWTEST_F(TimestampPacketTests, givenTagNodeWithPacketsUsed2WhenSemaphoreAndAtomicAreProgrammedThenUseGpuAddress) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_ATOMIC = typename FamilyType::MI_ATOMIC; @@ -166,7 +180,7 @@ HWTEST_F(TimestampPacketTests, givenTagNodeWithPacketsUsed2WhenSemaphoreAndAtomi mockNode.gpuAddress = 0x1230000; auto &cmdStream = mockCmdQ->getCS(0); - TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, mockNode); + TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, mockNode, 1); HardwareParse hwParser; hwParser.parseCommands(cmdStream, 0); @@ -853,6 +867,52 @@ HWTEST_F(TimestampPacketTests, 
givenTimestampPacketWriteEnabledWhenEnqueueingThe } } +HWTEST_F(TimestampPacketTests, givenMultipleDevicesOnCsrWhenIncrementingCpuDependenciesCountThenIncrementByTargetCsrDeviceCountValue) { + DeviceBitfield osContext0DeviceBitfiled = 0b011; + DeviceBitfield osContext1DeviceBitfiled = 0b1011; + auto osContext0 = std::unique_ptr(OsContext::create(nullptr, 0, osContext0DeviceBitfiled, aub_stream::EngineType::ENGINE_RCS, PreemptionMode::Disabled, false, false, false)); + auto osContext1 = std::unique_ptr(OsContext::create(nullptr, 1, osContext1DeviceBitfiled, aub_stream::EngineType::ENGINE_RCS, PreemptionMode::Disabled, false, false, false)); + EXPECT_EQ(2u, osContext0->getNumSupportedDevices()); + EXPECT_EQ(3u, osContext1->getNumSupportedDevices()); + + auto device0 = std::make_unique(Device::create(executionEnvironment, 0u)); + auto device1 = std::make_unique(Device::create(executionEnvironment, 1u)); + + device0->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + device0->getUltCommandStreamReceiver().setupContext(*osContext0); + device1->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + device1->getUltCommandStreamReceiver().setupContext(*osContext1); + + MockContext context0(device0.get()); + MockContext context1(device1.get()); + + auto cmdQ0 = std::make_unique>(&context0, device0.get(), nullptr); + auto cmdQ1 = std::make_unique>(&context1, device1.get(), nullptr); + + const cl_uint eventsOnWaitlist = 2; + MockTimestampPacketContainer timestamp0(*device0->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + MockTimestampPacketContainer timestamp1(*device1->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + + Event event0(cmdQ0.get(), 0, 0, 0); + Event event1(cmdQ1.get(), 0, 0, 0); + event0.addTimestampPacketNodes(timestamp0); + event1.addTimestampPacketNodes(timestamp1); + + cl_event waitlist[] = {&event0, &event1}; + + cmdQ0->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 
eventsOnWaitlist, waitlist, nullptr); + + verifyDependencyCounterValues(event0.getTimestampPacketNodes(), osContext0->getNumSupportedDevices()); + + verifyDependencyCounterValues(event1.getTimestampPacketNodes(), osContext0->getNumSupportedDevices()); + + cmdQ1->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, eventsOnWaitlist, waitlist, nullptr); + + verifyDependencyCounterValues(event0.getTimestampPacketNodes(), osContext0->getNumSupportedDevices() + osContext1->getNumSupportedDevices()); + + verifyDependencyCounterValues(event1.getTimestampPacketNodes(), osContext0->getNumSupportedDevices() + osContext1->getNumSupportedDevices()); +} + HWTEST_F(TimestampPacketTests, givenAllDependencyTypesModeWhenFillingFromDifferentCsrsThenPushEverything) { auto device2 = std::make_unique(Device::create(executionEnvironment, 1u)); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index a32f21e7e4..e74f00cb08 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -273,7 +273,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags, device)); auto commandStreamStartCSR = commandStreamCSR.getUsed(); - TimestampPacketHelper::programCsrDependencies(commandStreamCSR, dispatchFlags.csrDependencies); + TimestampPacketHelper::programCsrDependencies(commandStreamCSR, dispatchFlags.csrDependencies, getOsContext().getNumSupportedDevices()); if (stallingPipeControlOnNextFlushRequired) { programStallingPipeControlForBarrier(commandStreamCSR, dispatchFlags); @@ -858,7 +858,7 @@ uint32_t CommandStreamReceiverHw::blitBuffer(const BlitPropertiesCont programEnginePrologue(commandStream); for (auto &blitProperties : blitPropertiesContainer) { - TimestampPacketHelper::programCsrDependencies(commandStream, 
blitProperties.csrDependencies); + TimestampPacketHelper::programCsrDependencies(commandStream, blitProperties.csrDependencies, getOsContext().getNumSupportedDevices()); BlitCommandsHelper::dispatchBlitCommandsForBuffer(blitProperties, commandStream, *this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex]); diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index 6fc8470a27..9f6b987f79 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -111,7 +111,7 @@ struct TimestampPacketDependencies : public NonCopyableClass { struct TimestampPacketHelper { template - static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNode &timestampPacketNode) { + static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNode &timestampPacketNode, uint32_t numSupportedDevices) { using MI_ATOMIC = typename GfxFamily::MI_ATOMIC; using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; @@ -131,7 +131,9 @@ struct TimestampPacketHelper { } if (trackPostSyncDependencies) { - timestampPacketNode.incImplicitCpuDependenciesCount(); + for (uint32_t i = 0; i < numSupportedDevices; i++) { + timestampPacketNode.incImplicitCpuDependenciesCount(); + } auto miAtomic = cmdStream.getSpaceForCmd(); EncodeAtomic::programMiAtomic(miAtomic, dependenciesCountAddress, MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, @@ -140,10 +142,10 @@ struct TimestampPacketHelper { } template - static void programCsrDependencies(LinearStream &cmdStream, const CsrDependencies &csrDependencies) { + static void programCsrDependencies(LinearStream &cmdStream, const CsrDependencies &csrDependencies, uint32_t numSupportedDevices) { for (auto timestampPacketContainer : csrDependencies) { for (auto &node : timestampPacketContainer->peekNodes()) { - 
TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, *node); + TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, *node, numSupportedDevices); } } } @@ -151,7 +153,7 @@ struct TimestampPacketHelper { template static void programSemaphoreWithImplicitDependencyForAuxTranslation(LinearStream &cmdStream, const TimestampPacketDependencies *timestampPacketDependencies, - const HardwareInfo &hwInfo) { + const HardwareInfo &hwInfo, uint32_t numSupportedDevices) { auto &container = (auxTranslationDirection == AuxTranslationDirection::AuxToNonAux) ? timestampPacketDependencies->auxToNonAuxNodes : timestampPacketDependencies->nonAuxToAuxNodes; @@ -169,7 +171,7 @@ struct TimestampPacketHelper { } for (auto &node : container.peekNodes()) { - TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, *node); + TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, *node, numSupportedDevices); } }