Remove TSP implicit dependency tracking logic

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
2025-09-15 13:01:45 +08:00 · 2021-06-15 18:34:19 +00:00
parent 5770c7b8ea
commit 5af793ddc6
20 changed files with 43 additions and 355 deletions
--- a/shared/source/command_stream/command_stream_receiver_hw_base.inl
+++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl
@@ -315,7 +315,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
    auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags, device));
    auto commandStreamStartCSR = commandStreamCSR.getUsed();

-    TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies, getOsContext().getNumSupportedDevices());
+    TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);

    if (stallingPipeControlOnNextFlushRequired) {
        programStallingPipeControlForBarrier(commandStreamCSR, dispatchFlags);
@@ -1008,7 +1008,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::blitBuffer(const BlitPropertiesCont
    programEnginePrologue(commandStream);

    for (auto &blitProperties : blitPropertiesContainer) {
-        TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, blitProperties.csrDependencies, getOsContext().getNumSupportedDevices());
+        TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);

        if (blitProperties.outputTimestampPacket && profilingEnabled) {
            BlitCommandsHelper<GfxFamily>::encodeProfilingStartMmios(commandStream, *blitProperties.outputTimestampPacket);
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -261,7 +261,6 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableStaticPartitioning, -1, "Divide workload i
 DECLARE_DEBUG_VARIABLE(int32_t, UpdateTaskCountFromWait, -1, " Do not update task count after each enqueue, but send update request while wait, -1: default(disabled), 0: disabled, 1: enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, DeferOsContextInitialization, -1, "-1: default, 0: create all contexts immediately, 1: defer, if possible")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceHostPointerImport, -1, "-1: default, 0: disable, 1: enable, Forces the driver to import every host pointer coming into driver, WARNING this is not spec complaint.")
-DECLARE_DEBUG_VARIABLE(int32_t, DisableAtomicForPostSyncs, -1, "When enabled, post syncs are not tracked with atomics")
 DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger")
 DECLARE_DEBUG_VARIABLE(bool, ReturnRawGpuTimestamps, false, "Driver returns raw GPU tiemstamps instead of calculated ones.")
 DECLARE_DEBUG_VARIABLE(bool, ForcePerDssBackedBufferProgramming, false, "Always program per-DSS memory backed buffer in preamble")
--- a/shared/source/helpers/CMakeLists.txt
+++ b/shared/source/helpers/CMakeLists.txt
@@ -102,7 +102,6 @@ set(NEO_CORE_HELPERS
    ${CMAKE_CURRENT_SOURCE_DIR}/timestamp_offsets.h
    ${CMAKE_CURRENT_SOURCE_DIR}/timestamp_packet.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/timestamp_packet.h
-    ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/timestamp_packet_extra.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/uint16_avx2.h
    ${CMAKE_CURRENT_SOURCE_DIR}/uint16_sse4.h
    ${CMAKE_CURRENT_SOURCE_DIR}/vec.h
--- a/shared/source/helpers/timestamp_packet.h
+++ b/shared/source/helpers/timestamp_packet.h
@@ -17,7 +17,6 @@

 #include "pipe_control_args.h"

-#include <atomic>
 #include <cstdint>
 #include <vector>

@@ -57,7 +56,6 @@ class TimestampPackets : public TagTypeBase {
            packet.globalEnd = 1u;
        }
        packetsUsed = 1;
-        implicitGpuDependenciesCount = 0;
    }

    void assignDataToAllTimestamps(uint32_t packetIndex, void *source) {
@@ -68,7 +66,6 @@ class TimestampPackets : public TagTypeBase {
    static constexpr size_t getContextStartOffset() { return offsetof(Packet, contextStart); }
    static constexpr size_t getContextEndOffset() { return offsetof(Packet, contextEnd); }
    static constexpr size_t getGlobalEndOffset() { return offsetof(Packet, globalEnd); }
-    size_t getImplicitGpuDependenciesCountOffset() const { return ptrDiff(&implicitGpuDependenciesCount, this); }

    uint64_t getContextStartValue(uint32_t packetIndex) const { return static_cast<uint64_t>(packets[packetIndex].contextStart); }
    uint64_t getGlobalStartValue(uint32_t packetIndex) const { return static_cast<uint64_t>(packets[packetIndex].globalStart); }
@@ -78,16 +75,13 @@ class TimestampPackets : public TagTypeBase {
    void setPacketsUsed(uint32_t used) { packetsUsed = used; }
    uint32_t getPacketsUsed() const { return packetsUsed; }

-    uint32_t getImplicitGpuDependenciesCount() const { return implicitGpuDependenciesCount; }
-
  protected:
    Packet packets[TimestampPacketSizeControl::preferredPacketCount];
-    uint32_t implicitGpuDependenciesCount = 0;
    uint32_t packetsUsed = 1;
 };
 #pragma pack()

-static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 2) * sizeof(uint32_t)) == sizeof(TimestampPackets<uint32_t>),
+static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 1) * sizeof(uint32_t)) == sizeof(TimestampPackets<uint32_t>),
              "This structure is consumed by GPU and has to follow specific restrictions for padding and size");

 class TimestampPacketContainer : public NonCopyableClass {
@@ -130,44 +124,24 @@ struct TimestampPacketHelper {
        return timestampPacketNode.getGpuAddress() + timestampPacketNode.getGlobalStartOffset();
    }

-    static uint64_t getGpuDependenciesCountGpuAddress(const TagNodeBase &timestampPacketNode) {
-        return timestampPacketNode.getGpuAddress() + timestampPacketNode.getImplicitGpuDependenciesCountOffset();
-    }
-
-    static void overrideSupportedDevicesCount(uint32_t &numSupportedDevices);
-
    template <typename GfxFamily>
-    static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNodeBase &timestampPacketNode, uint32_t numSupportedDevices) {
-        using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
+    static void programSemaphore(LinearStream &cmdStream, TagNodeBase &timestampPacketNode) {
        using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
        using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;

        auto compareAddress = getContextEndGpuAddress(timestampPacketNode);
-        auto dependenciesCountAddress = getGpuDependenciesCountGpuAddress(timestampPacketNode);

        for (uint32_t packetId = 0; packetId < timestampPacketNode.getPacketsUsed(); packetId++) {
            uint64_t compareOffset = packetId * timestampPacketNode.getSinglePacketSize();
            EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(cmdStream, compareAddress + compareOffset, 1, COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
        }
-
-        if (DebugManager.flags.DisableAtomicForPostSyncs.get() == 0) {
-            overrideSupportedDevicesCount(numSupportedDevices);
-
-            for (uint32_t i = 0; i < numSupportedDevices; i++) {
-                timestampPacketNode.incImplicitCpuDependenciesCount();
-            }
-            EncodeAtomic<GfxFamily>::programMiAtomic(cmdStream, dependenciesCountAddress,
-                                                     MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT,
-                                                     MI_ATOMIC::DATA_SIZE::DATA_SIZE_DWORD,
-                                                     0u, 0u, 0x0u, 0x0u);
-        }
    }

    template <typename GfxFamily>
-    static void programCsrDependenciesForTimestampPacketContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies, uint32_t numSupportedDevices) {
+    static void programCsrDependenciesForTimestampPacketContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) {
        for (auto timestampPacketContainer : csrDependencies.timestampPacketContainer) {
            for (auto &node : timestampPacketContainer->peekNodes()) {
-                TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(cmdStream, *node, numSupportedDevices);
+                TimestampPacketHelper::programSemaphore<GfxFamily>(cmdStream, *node);
            }
        }
    }
@@ -188,9 +162,9 @@ struct TimestampPacketHelper {
    }

    template <typename GfxFamily, AuxTranslationDirection auxTranslationDirection>
-    static void programSemaphoreWithImplicitDependencyForAuxTranslation(LinearStream &cmdStream,
-                                                                        const TimestampPacketDependencies *timestampPacketDependencies,
-                                                                        const HardwareInfo &hwInfo, uint32_t numSupportedDevices) {
+    static void programSemaphoreForAuxTranslation(LinearStream &cmdStream,
+                                                  const TimestampPacketDependencies *timestampPacketDependencies,
+                                                  const HardwareInfo &hwInfo) {
        auto &container = (auxTranslationDirection == AuxTranslationDirection::AuxToNonAux)
                              ? timestampPacketDependencies->auxToNonAuxNodes
                              : timestampPacketDependencies->nonAuxToAuxNodes;
@@ -207,7 +181,7 @@ struct TimestampPacketHelper {
        }

        for (auto &node : container.peekNodes()) {
-            TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(cmdStream, *node, numSupportedDevices);
+            TimestampPacketHelper::programSemaphore<GfxFamily>(cmdStream, *node);
        }
    }

@@ -224,14 +198,12 @@ struct TimestampPacketHelper {

    template <typename GfxFamily>
    static size_t getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue() {
-        return sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT) + sizeof(typename GfxFamily::MI_ATOMIC);
+        return sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
    }

    template <typename GfxFamily>
    static size_t getRequiredCmdStreamSizeForNodeDependency(TagNodeBase &timestampPacketNode) {
-        size_t totalMiSemaphoreWaitSize = timestampPacketNode.getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
-
-        return totalMiSemaphoreWaitSize + sizeof(typename GfxFamily::MI_ATOMIC);
+        return (timestampPacketNode.getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT));
    }

    template <typename GfxFamily>
--- a/shared/source/helpers/timestamp_packet_extra.cpp
+++ b/shared/source/helpers/timestamp_packet_extra.cpp
@@ -1,13 +0,0 @@
-/*
- * Copyright (C) 2020 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-#include "shared/source/helpers/timestamp_packet.h"
-
-namespace NEO {
-void TimestampPacketHelper::overrideSupportedDevicesCount(uint32_t &numSupportedDevices) {
-}
-} // namespace NEO
--- a/shared/source/utilities/tag_allocator.cpp
+++ b/shared/source/utilities/tag_allocator.cpp
@@ -40,8 +40,7 @@ void TagNodeBase::returnTag() {
 }

 bool TagNodeBase::canBeReleased() const {
-    return (!doNotReleaseNodes) &&
-           (getImplicitGpuDependenciesCount() == getImplicitCpuDependenciesCount());
+    return !doNotReleaseNodes;
 }

 } // namespace NEO
--- a/shared/source/utilities/tag_allocator.h
+++ b/shared/source/utilities/tag_allocator.h
@@ -54,10 +54,6 @@ class TagNodeBase : public NonCopyableOrMovableClass {

    bool isProfilingCapable() const { return profilingCapable; }

-    void incImplicitCpuDependenciesCount() { implicitCpuDependenciesCount++; }
-
-    uint32_t getImplicitCpuDependenciesCount() const { return implicitCpuDependenciesCount.load(); }
-
    const TagAllocatorBase *getAllocator() const { return allocator; }

    // TagType specific calls
@@ -67,7 +63,6 @@ class TagNodeBase : public NonCopyableOrMovableClass {
    virtual size_t getContextStartOffset() const = 0;
    virtual size_t getContextEndOffset() const = 0;
    virtual size_t getGlobalEndOffset() const = 0;
-    virtual size_t getImplicitGpuDependenciesCountOffset() const = 0;

    virtual uint64_t getContextStartValue(uint32_t packetIndex) const = 0;
    virtual uint64_t getGlobalStartValue(uint32_t packetIndex) const = 0;
@@ -82,8 +77,6 @@ class TagNodeBase : public NonCopyableOrMovableClass {

    virtual size_t getSinglePacketSize() const = 0;

-    virtual uint32_t getImplicitGpuDependenciesCount() const = 0;
-
    virtual MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const = 0;

  protected:
@@ -94,7 +87,6 @@ class TagNodeBase : public NonCopyableOrMovableClass {
    MultiGraphicsAllocation *gfxAllocation = nullptr;
    uint64_t gpuAddress = 0;
    std::atomic<uint32_t> refCount{0};
-    std::atomic<uint32_t> implicitCpuDependenciesCount{0};
    bool doNotReleaseNodes = false;
    bool profilingCapable = true;

@@ -112,7 +104,6 @@ class TagNode : public TagNodeBase, public IDNode<TagNode<TagType>> {

    void initialize() override {
        tagForCpuAccess->initialize();
-        implicitCpuDependenciesCount.store(0);
        setProfilingCapable(true);
    }

@@ -124,7 +115,6 @@ class TagNode : public TagNodeBase, public IDNode<TagNode<TagType>> {
    size_t getContextStartOffset() const override;
    size_t getContextEndOffset() const override;
    size_t getGlobalEndOffset() const override;
-    size_t getImplicitGpuDependenciesCountOffset() const override;

    uint64_t getContextStartValue(uint32_t packetIndex) const override;
    uint64_t getGlobalStartValue(uint32_t packetIndex) const override;
@@ -139,8 +129,6 @@ class TagNode : public TagNodeBase, public IDNode<TagNode<TagType>> {

    size_t getSinglePacketSize() const override;

-    uint32_t getImplicitGpuDependenciesCount() const override;
-
    MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const override;
 };

--- a/shared/source/utilities/tag_allocator.inl
+++ b/shared/source/utilities/tag_allocator.inl
@@ -164,15 +164,6 @@ size_t TagNode<TagType>::getGlobalEndOffset() const {
    }
 }

-template <typename TagType>
-size_t TagNode<TagType>::getImplicitGpuDependenciesCountOffset() const {
-    if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
-        return tagForCpuAccess->getImplicitGpuDependenciesCountOffset();
-    } else {
-        UNRECOVERABLE_IF(true);
-    }
-}
-
 template <typename TagType>
 uint64_t TagNode<TagType>::getContextStartValue(uint32_t packetIndex) const {
    if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) {
@@ -250,16 +241,6 @@ uint32_t TagNode<TagType>::getPacketsUsed() const {
    }
 }

-template <typename TagType>
-uint32_t TagNode<TagType>::getImplicitGpuDependenciesCount() const {
-    if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
-        if (DebugManager.flags.DisableAtomicForPostSyncs.get() == 0) {
-            return tagForCpuAccess->getImplicitGpuDependenciesCount();
-        }
-    }
-    return 0;
-}
-
 template <typename TagType>
 size_t TagNode<TagType>::getSinglePacketSize() const {
    if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {