Disable TimestampPacket optimizations in Aub/Tbx mode

Avoid removing semaphores and reusing returned tags.

Change-Id: Ic26167953c5d5a9ccceaae49f4921af11a375fab
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
2026-01-10 15:12:56 +08:00 · 2019-12-02 08:37:42 +01:00
parent 54f65c0243
commit 0527c9113c
16 changed files with 149 additions and 38 deletions
--- a/runtime/command_stream/command_stream_receiver.cpp
+++ b/runtime/command_stream/command_stream_receiver.cpp
@@ -416,21 +416,28 @@ bool CommandStreamReceiver::createAllocationForHostSurface(HostPtrSurface &surfa

 TagAllocator<HwTimeStamps> *CommandStreamReceiver::getEventTsAllocator() {
    if (profilingTimeStampAllocator.get() == nullptr) {
-        profilingTimeStampAllocator = std::make_unique<TagAllocator<HwTimeStamps>>(rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize);
+        profilingTimeStampAllocator = std::make_unique<TagAllocator<HwTimeStamps>>(
+            rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, sizeof(HwTimeStamps), false);
    }
    return profilingTimeStampAllocator.get();
 }

 TagAllocator<HwPerfCounter> *CommandStreamReceiver::getEventPerfCountAllocator(const uint32_t tagSize) {
    if (perfCounterAllocator.get() == nullptr) {
-        perfCounterAllocator = std::make_unique<TagAllocator<HwPerfCounter>>(rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, tagSize);
+        perfCounterAllocator = std::make_unique<TagAllocator<HwPerfCounter>>(
+            rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, tagSize, false);
    }
    return perfCounterAllocator.get();
 }

 TagAllocator<TimestampPacketStorage> *CommandStreamReceiver::getTimestampPacketAllocator() {
    if (timestampPacketAllocator.get() == nullptr) {
-        timestampPacketAllocator = std::make_unique<TagAllocator<TimestampPacketStorage>>(rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize);
+        // Don't release nodes in AUB/TBX mode, so that the semaphore-removal optimization is skipped and returned tags are not reused.
+        bool doNotReleaseNodes = (getType() > CommandStreamReceiverType::CSR_HW);
+
+        timestampPacketAllocator = std::make_unique<TagAllocator<TimestampPacketStorage>>(
+            rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize,
+            sizeof(TimestampPacketStorage), doNotReleaseNodes);
    }
    return timestampPacketAllocator.get();
 }
--- a/runtime/command_stream/command_stream_receiver_with_aub_dump.h
+++ b/runtime/command_stream/command_stream_receiver_with_aub_dump.h
@@ -29,6 +29,13 @@ class CommandStreamReceiverWithAUBDump : public BaseCSR {
    AubSubCaptureStatus checkAndActivateAubSubCapture(const MultiDispatchInfo &dispatchInfo) override;
    void setupContext(OsContext &osContext) override;

+    CommandStreamReceiverType getType() override {
+        if (BaseCSR::getType() == CommandStreamReceiverType::CSR_TBX) {
+            return CommandStreamReceiverType::CSR_TBX_WITH_AUB;
+        }
+        return CommandStreamReceiverType::CSR_HW_WITH_AUB;
+    }
+
    std::unique_ptr<CommandStreamReceiver> aubCSR;
 };

--- a/runtime/event/hw_timestamps.h
+++ b/runtime/event/hw_timestamps.h
@@ -21,7 +21,7 @@ struct HwTimeStamps {
        GlobalCompleteTS = 0;
        ContextCompleteTS = 0;
    }
-    bool canBeReleased() const { return true; }
+    bool isCompleted() const { return true; }
    static GraphicsAllocation::AllocationType getAllocationType() {
        return GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER;
    }
--- a/runtime/event/perf_counter.h
+++ b/runtime/event/perf_counter.h
@@ -20,7 +20,7 @@ struct HwPerfCounter {
    static GraphicsAllocation::AllocationType getAllocationType() {
        return GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER;
    }
-    bool canBeReleased() const { return true; }
+    bool isCompleted() const { return true; }

    // Gpu report size is not known during compile time.
    // Such information will be provided by metrics library dll.
--- a/runtime/helpers/timestamp_packet.cpp
+++ b/runtime/helpers/timestamp_packet.cpp
@@ -33,7 +33,7 @@ void TimestampPacketContainer::resolveDependencies(bool clearAllDependencies) {
    std::vector<Node *> pendingNodes;

    for (auto node : timestampPacketNodes) {
-        if (node->tagForCpuAccess->canBeReleased() || clearAllDependencies) {
+        if (node->canBeReleased() || clearAllDependencies) {
            node->returnTag();
        } else {
            pendingNodes.push_back(node);
--- a/runtime/helpers/timestamp_packet.h
+++ b/runtime/helpers/timestamp_packet.h
@@ -42,17 +42,13 @@ struct TimestampPacketStorage {
        return GraphicsAllocation::AllocationType::TIMESTAMP_PACKET_TAG_BUFFER;
    }

-    bool canBeReleased() const {
-        return isCompleted() && implicitDependenciesCount.load() == 0;
-    }
-
    bool isCompleted() const {
        for (uint32_t i = 0; i < packetsUsed; i++) {
            if ((packets[i].contextEnd & 1) || (packets[i].globalEnd & 1)) {
                return false;
            }
        }
-        return true;
+        return implicitDependenciesCount.load() == 0;
    }

    void initialize() {
--- a/runtime/utilities/tag_allocator.h
+++ b/runtime/utilities/tag_allocator.h
@@ -36,11 +36,20 @@ struct TagNode : public IDNode<TagNode<TagType>> {
        allocator->returnTag(this);
    }

+    bool canBeReleased() const {
+        return !doNotReleaseNodes && tagForCpuAccess->isCompleted();
+    }
+
+    void setDoNotReleaseNodes(bool doNotRelease) {
+        doNotReleaseNodes = doNotRelease;
+    }
+
  protected:
    TagAllocator<TagType> *allocator = nullptr;
    GraphicsAllocation *gfxAllocation = nullptr;
    uint64_t gpuAddress = 0;
    std::atomic<uint32_t> refCount{0};
+    bool doNotReleaseNodes = false;

    template <typename TagType2>
    friend class TagAllocator;
@@ -52,10 +61,11 @@ class TagAllocator {
    using NodeType = TagNode<TagType>;

    TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount,
-                 size_t tagAlignment, size_t tagSize = sizeof(TagType)) : rootDeviceIndex(rootDeviceIndex),
-                                                                          memoryManager(memMngr),
-                                                                          tagCount(tagCount),
-                                                                          tagAlignment(tagAlignment) {
+                 size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes) : rootDeviceIndex(rootDeviceIndex),
+                                                                                memoryManager(memMngr),
+                                                                                tagCount(tagCount),
+                                                                                tagAlignment(tagAlignment),
+                                                                                doNotReleaseNodes(doNotReleaseNodes) {

        this->tagSize = alignUp(tagSize, tagAlignment);
        populateFreeTags();
@@ -95,7 +105,7 @@ class TagAllocator {

    MOCKABLE_VIRTUAL void returnTag(NodeType *node) {
        if (node->refCount.fetch_sub(1) == 1) {
-            if (node->tagForCpuAccess->canBeReleased()) {
+            if (node->canBeReleased()) {
                returnTagToFreePool(node);
            } else {
                returnTagToDeferredPool(node);
@@ -115,6 +125,7 @@ class TagAllocator {
    size_t tagCount;
    size_t tagAlignment;
    size_t tagSize;
+    bool doNotReleaseNodes = false;

    std::mutex allocatorMutex;

@@ -150,6 +161,7 @@ class TagAllocator {
            nodesMemory[i].gfxAllocation = graphicsAllocation;
            nodesMemory[i].tagForCpuAccess = reinterpret_cast<TagType *>(Start);
            nodesMemory[i].gpuAddress = gpuBaseAddress + (i * tagSize);
+            nodesMemory[i].setDoNotReleaseNodes(doNotReleaseNodes);
            freeTags.pushTailOne(nodesMemory[i]);
            Start += tagSize;
        }
@@ -165,7 +177,7 @@ class TagAllocator {

        while (currentNode != nullptr) {
            auto nextNode = currentNode->next;
-            if (currentNode->tagForCpuAccess->canBeReleased()) {
+            if (currentNode->canBeReleased()) {
                pendingFreeTags.pushFrontOne(*currentNode);
            } else {
                pendingDeferredTags.pushFrontOne(*currentNode);