mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-26 15:03:02 +08:00
Revert TSP changes
This commit reverts: a1d2bdc76666059653c79fe39a26113ce47c632a, 71a115129c1698ff15305fd0ea3828cba861be47, e1a9087a466bfba54d84a64247e6596092034a91. Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
48feca4f44
commit
2e97aeccfd
@@ -316,7 +316,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
|
||||
auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags, device));
|
||||
auto commandStreamStartCSR = commandStreamCSR.getUsed();
|
||||
|
||||
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
|
||||
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies, getOsContext().getNumSupportedDevices());
|
||||
TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
|
||||
|
||||
if (stallingPipeControlOnNextFlushRequired) {
|
||||
@@ -1011,7 +1011,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::blitBuffer(const BlitPropertiesCont
|
||||
programEnginePrologue(commandStream);
|
||||
|
||||
for (auto &blitProperties : blitPropertiesContainer) {
|
||||
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);
|
||||
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, blitProperties.csrDependencies, getOsContext().getNumSupportedDevices());
|
||||
TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);
|
||||
|
||||
if (blitProperties.outputTimestampPacket && profilingEnabled) {
|
||||
|
||||
@@ -265,6 +265,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ForceHostPointerImport, -1, "-1: default, 0: dis
|
||||
DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger")
|
||||
DECLARE_DEBUG_VARIABLE(bool, ReturnRawGpuTimestamps, false, "Driver returns raw GPU tiemstamps instead of calculated ones.")
|
||||
DECLARE_DEBUG_VARIABLE(bool, ForcePerDssBackedBufferProgramming, false, "Always program per-DSS memory backed buffer in preamble")
|
||||
DECLARE_DEBUG_VARIABLE(bool, DisableAtomicForPostSyncs, false, "When enabled, post syncs are not tracked with atomics")
|
||||
DECLARE_DEBUG_VARIABLE(bool, UseCommandBufferHeaderSizeForWddmQueueSubmission, true, "0: Page size (4096), 1: sizeof(COMMAND_BUFFER_HEADER)")
|
||||
DECLARE_DEBUG_VARIABLE(bool, DisableDeepBind, false, "Disable passing RTLD_DEEPBIND flag to all dlopen calls.")
|
||||
DECLARE_DEBUG_VARIABLE(bool, UseUmKmDataTranslator, false, "Use helper library for UMD<->KMD (WDDM) struct layout compatibility")
|
||||
|
||||
@@ -102,6 +102,7 @@ set(NEO_CORE_HELPERS
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/timestamp_offsets.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/timestamp_packet.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/timestamp_packet.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/timestamp_packet_extra.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/uint16_avx2.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/uint16_sse4.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/vec.h
|
||||
|
||||
@@ -31,7 +31,7 @@ void TimestampPacketContainer::resolveDependencies(bool clearAllDependencies) {
|
||||
std::vector<TagNodeBase *> pendingNodes;
|
||||
|
||||
for (auto node : timestampPacketNodes) {
|
||||
if (clearAllDependencies) {
|
||||
if (node->canBeReleased() || clearAllDependencies) {
|
||||
node->returnTag();
|
||||
} else {
|
||||
pendingNodes.push_back(node);
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
#include "pipe_control_args.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
@@ -48,6 +49,20 @@ class TimestampPackets : public TagTypeBase {
|
||||
|
||||
static constexpr size_t getSinglePacketSize() { return sizeof(Packet); }
|
||||
|
||||
// Reports whether the packets currently in use are finished, judged by their contextEnd values.
// Returns false unconditionally when the DisableAtomicForPostSyncs debug flag is set.
bool isCompleted() const {
|
||||
if (DebugManager.flags.DisableAtomicForPostSyncs.get()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Only the first packetsUsed entries are inspected; a contextEnd still equal to 1 means "not completed".
// NOTE(review): 1 is presumably the value initialize() writes into contextEnd - confirm.
for (uint32_t i = 0; i < packetsUsed; i++) {
|
||||
if (packets[i].contextEnd == 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
for (auto &packet : packets) {
|
||||
packet.contextStart = 1u;
|
||||
@@ -56,6 +71,7 @@ class TimestampPackets : public TagTypeBase {
|
||||
packet.globalEnd = 1u;
|
||||
}
|
||||
packetsUsed = 1;
|
||||
implicitGpuDependenciesCount = 0;
|
||||
}
|
||||
|
||||
void assignDataToAllTimestamps(uint32_t packetIndex, void *source) {
|
||||
@@ -66,6 +82,7 @@ class TimestampPackets : public TagTypeBase {
|
||||
static constexpr size_t getContextStartOffset() { return offsetof(Packet, contextStart); }
|
||||
static constexpr size_t getContextEndOffset() { return offsetof(Packet, contextEnd); }
|
||||
static constexpr size_t getGlobalEndOffset() { return offsetof(Packet, globalEnd); }
|
||||
size_t getImplicitGpuDependenciesCountOffset() const { return ptrDiff(&implicitGpuDependenciesCount, this); }
|
||||
|
||||
uint64_t getContextStartValue(uint32_t packetIndex) const { return static_cast<uint64_t>(packets[packetIndex].contextStart); }
|
||||
uint64_t getGlobalStartValue(uint32_t packetIndex) const { return static_cast<uint64_t>(packets[packetIndex].globalStart); }
|
||||
@@ -75,13 +92,16 @@ class TimestampPackets : public TagTypeBase {
|
||||
void setPacketsUsed(uint32_t used) { packetsUsed = used; }
|
||||
uint32_t getPacketsUsed() const { return packetsUsed; }
|
||||
|
||||
uint32_t getImplicitGpuDependenciesCount() const { return implicitGpuDependenciesCount; }
|
||||
|
||||
protected:
|
||||
Packet packets[TimestampPacketSizeControl::preferredPacketCount];
|
||||
uint32_t implicitGpuDependenciesCount = 0;
|
||||
uint32_t packetsUsed = 1;
|
||||
};
|
||||
#pragma pack()
|
||||
|
||||
static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 1) * sizeof(uint32_t)) == sizeof(TimestampPackets<uint32_t>),
|
||||
static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 2) * sizeof(uint32_t)) == sizeof(TimestampPackets<uint32_t>),
|
||||
"This structure is consumed by GPU and has to follow specific restrictions for padding and size");
|
||||
|
||||
class TimestampPacketContainer : public NonCopyableClass {
|
||||
@@ -124,24 +144,49 @@ struct TimestampPacketHelper {
|
||||
return timestampPacketNode.getGpuAddress() + timestampPacketNode.getGlobalStartOffset();
|
||||
}
|
||||
|
||||
static uint64_t getGpuDependenciesCountGpuAddress(const TagNodeBase &timestampPacketNode) {
|
||||
return timestampPacketNode.getGpuAddress() + timestampPacketNode.getImplicitGpuDependenciesCountOffset();
|
||||
}
|
||||
|
||||
static void overrideSupportedDevicesCount(uint32_t &numSupportedDevices);
|
||||
|
||||
template <typename GfxFamily>
|
||||
static void programSemaphore(LinearStream &cmdStream, TagNodeBase &timestampPacketNode) {
|
||||
static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNodeBase &timestampPacketNode, uint32_t numSupportedDevices) {
|
||||
using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
|
||||
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
|
||||
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
|
||||
|
||||
auto compareAddress = getContextEndGpuAddress(timestampPacketNode);
|
||||
auto dependenciesCountAddress = getGpuDependenciesCountGpuAddress(timestampPacketNode);
|
||||
|
||||
for (uint32_t packetId = 0; packetId < timestampPacketNode.getPacketsUsed(); packetId++) {
|
||||
uint64_t compareOffset = packetId * timestampPacketNode.getSinglePacketSize();
|
||||
EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(cmdStream, compareAddress + compareOffset, 1, COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
|
||||
}
|
||||
|
||||
bool trackPostSyncDependencies = true;
|
||||
if (DebugManager.flags.DisableAtomicForPostSyncs.get()) {
|
||||
trackPostSyncDependencies = false;
|
||||
}
|
||||
|
||||
if (trackPostSyncDependencies) {
|
||||
overrideSupportedDevicesCount(numSupportedDevices);
|
||||
|
||||
for (uint32_t i = 0; i < numSupportedDevices; i++) {
|
||||
timestampPacketNode.incImplicitCpuDependenciesCount();
|
||||
}
|
||||
EncodeAtomic<GfxFamily>::programMiAtomic(cmdStream, dependenciesCountAddress,
|
||||
MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT,
|
||||
MI_ATOMIC::DATA_SIZE::DATA_SIZE_DWORD,
|
||||
0u, 0u, 0x0u, 0x0u);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
static void programCsrDependenciesForTimestampPacketContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) {
|
||||
static void programCsrDependenciesForTimestampPacketContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies, uint32_t numSupportedDevices) {
|
||||
for (auto timestampPacketContainer : csrDependencies.timestampPacketContainer) {
|
||||
for (auto &node : timestampPacketContainer->peekNodes()) {
|
||||
TimestampPacketHelper::programSemaphore<GfxFamily>(cmdStream, *node);
|
||||
TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(cmdStream, *node, numSupportedDevices);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -162,9 +207,9 @@ struct TimestampPacketHelper {
|
||||
}
|
||||
|
||||
template <typename GfxFamily, AuxTranslationDirection auxTranslationDirection>
|
||||
static void programSemaphoreForAuxTranslation(LinearStream &cmdStream,
|
||||
const TimestampPacketDependencies *timestampPacketDependencies,
|
||||
const HardwareInfo &hwInfo) {
|
||||
static void programSemaphoreWithImplicitDependencyForAuxTranslation(LinearStream &cmdStream,
|
||||
const TimestampPacketDependencies *timestampPacketDependencies,
|
||||
const HardwareInfo &hwInfo, uint32_t numSupportedDevices) {
|
||||
auto &container = (auxTranslationDirection == AuxTranslationDirection::AuxToNonAux)
|
||||
? timestampPacketDependencies->auxToNonAuxNodes
|
||||
: timestampPacketDependencies->nonAuxToAuxNodes;
|
||||
@@ -181,7 +226,7 @@ struct TimestampPacketHelper {
|
||||
}
|
||||
|
||||
for (auto &node : container.peekNodes()) {
|
||||
TimestampPacketHelper::programSemaphore<GfxFamily>(cmdStream, *node);
|
||||
TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(cmdStream, *node, numSupportedDevices);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,12 +243,14 @@ struct TimestampPacketHelper {
|
||||
|
||||
template <typename GfxFamily>
|
||||
static size_t getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue() {
|
||||
return sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
|
||||
return sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT) + sizeof(typename GfxFamily::MI_ATOMIC);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
static size_t getRequiredCmdStreamSizeForNodeDependency(TagNodeBase &timestampPacketNode) {
|
||||
return (timestampPacketNode.getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT));
|
||||
size_t totalMiSemaphoreWaitSize = timestampPacketNode.getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
|
||||
|
||||
return totalMiSemaphoreWaitSize + sizeof(typename GfxFamily::MI_ATOMIC);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
|
||||
13
shared/source/helpers/timestamp_packet_extra.cpp
Normal file
13
shared/source/helpers/timestamp_packet_extra.cpp
Normal file
@@ -0,0 +1,13 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/helpers/timestamp_packet.h"
|
||||
|
||||
namespace NEO {
|
||||
// Empty implementation: numSupportedDevices is left unchanged.
// NOTE(review): this file is added to the build via ${BRANCH_DIR_SUFFIX}, so other branches
// presumably provide their own version that adjusts the count - confirm.
void TimestampPacketHelper::overrideSupportedDevicesCount(uint32_t &numSupportedDevices) {
|
||||
}
|
||||
} // namespace NEO
|
||||
@@ -40,7 +40,9 @@ void TagNodeBase::returnTag() {
|
||||
}
|
||||
|
||||
bool TagNodeBase::canBeReleased() const {
|
||||
return !doNotReleaseNodes;
|
||||
return (!doNotReleaseNodes) &&
|
||||
(isCompleted()) &&
|
||||
(getImplicitGpuDependenciesCount() == getImplicitCpuDependenciesCount());
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -54,15 +54,21 @@ class TagNodeBase : public NonCopyableOrMovableClass {
|
||||
|
||||
bool isProfilingCapable() const { return profilingCapable; }
|
||||
|
||||
void incImplicitCpuDependenciesCount() { implicitCpuDependenciesCount++; }
|
||||
|
||||
uint32_t getImplicitCpuDependenciesCount() const { return implicitCpuDependenciesCount.load(); }
|
||||
|
||||
const TagAllocatorBase *getAllocator() const { return allocator; }
|
||||
|
||||
// TagType specific calls
|
||||
virtual bool isCompleted() const = 0;
|
||||
virtual void assignDataToAllTimestamps(uint32_t packetIndex, void *source) = 0;
|
||||
|
||||
virtual size_t getGlobalStartOffset() const = 0;
|
||||
virtual size_t getContextStartOffset() const = 0;
|
||||
virtual size_t getContextEndOffset() const = 0;
|
||||
virtual size_t getGlobalEndOffset() const = 0;
|
||||
virtual size_t getImplicitGpuDependenciesCountOffset() const = 0;
|
||||
|
||||
virtual uint64_t getContextStartValue(uint32_t packetIndex) const = 0;
|
||||
virtual uint64_t getGlobalStartValue(uint32_t packetIndex) const = 0;
|
||||
@@ -77,6 +83,8 @@ class TagNodeBase : public NonCopyableOrMovableClass {
|
||||
|
||||
virtual size_t getSinglePacketSize() const = 0;
|
||||
|
||||
virtual uint32_t getImplicitGpuDependenciesCount() const = 0;
|
||||
|
||||
virtual MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const = 0;
|
||||
|
||||
protected:
|
||||
@@ -87,6 +95,7 @@ class TagNodeBase : public NonCopyableOrMovableClass {
|
||||
MultiGraphicsAllocation *gfxAllocation = nullptr;
|
||||
uint64_t gpuAddress = 0;
|
||||
std::atomic<uint32_t> refCount{0};
|
||||
std::atomic<uint32_t> implicitCpuDependenciesCount{0};
|
||||
bool doNotReleaseNodes = false;
|
||||
bool profilingCapable = true;
|
||||
|
||||
@@ -104,6 +113,7 @@ class TagNode : public TagNodeBase, public IDNode<TagNode<TagType>> {
|
||||
|
||||
// Resets the node: re-initializes the CPU-visible tag, zeroes the implicit CPU
// dependencies counter and marks the node as profiling capable again.
void initialize() override {
|
||||
tagForCpuAccess->initialize();
|
||||
implicitCpuDependenciesCount.store(0);
|
||||
setProfilingCapable(true);
|
||||
}
|
||||
|
||||
@@ -111,10 +121,13 @@ class TagNode : public TagNodeBase, public IDNode<TagNode<TagType>> {
|
||||
|
||||
void assignDataToAllTimestamps(uint32_t packetIndex, void *source) override;
|
||||
|
||||
bool isCompleted() const override;
|
||||
|
||||
size_t getGlobalStartOffset() const override;
|
||||
size_t getContextStartOffset() const override;
|
||||
size_t getContextEndOffset() const override;
|
||||
size_t getGlobalEndOffset() const override;
|
||||
size_t getImplicitGpuDependenciesCountOffset() const override;
|
||||
|
||||
uint64_t getContextStartValue(uint32_t packetIndex) const override;
|
||||
uint64_t getGlobalStartValue(uint32_t packetIndex) const override;
|
||||
@@ -129,6 +142,8 @@ class TagNode : public TagNodeBase, public IDNode<TagNode<TagType>> {
|
||||
|
||||
size_t getSinglePacketSize() const override;
|
||||
|
||||
uint32_t getImplicitGpuDependenciesCount() const override;
|
||||
|
||||
MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const override;
|
||||
};
|
||||
|
||||
|
||||
@@ -164,6 +164,15 @@ size_t TagNode<TagType>::getGlobalEndOffset() const {
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the offset of the implicit GPU dependencies counter as reported by the tag.
// Only TimestampPacket tags provide this value; any other tag type ends in UNRECOVERABLE_IF(true).
template <typename TagType>
|
||||
size_t TagNode<TagType>::getImplicitGpuDependenciesCountOffset() const {
|
||||
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
|
||||
return tagForCpuAccess->getImplicitGpuDependenciesCountOffset();
|
||||
} else {
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TagType>
|
||||
uint64_t TagNode<TagType>::getContextStartValue(uint32_t packetIndex) const {
|
||||
if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) {
|
||||
@@ -241,6 +250,15 @@ uint32_t TagNode<TagType>::getPacketsUsed() const {
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the implicit GPU dependencies count read from the tag.
// Tag types other than TimestampPacket do not track it and report 0.
template <typename TagType>
|
||||
uint32_t TagNode<TagType>::getImplicitGpuDependenciesCount() const {
|
||||
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
|
||||
return tagForCpuAccess->getImplicitGpuDependenciesCount();
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TagType>
|
||||
size_t TagNode<TagType>::getSinglePacketSize() const {
|
||||
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
|
||||
@@ -261,6 +279,15 @@ void TagNode<TagType>::assignDataToAllTimestamps(uint32_t packetIndex, void *sou
|
||||
}
|
||||
}
|
||||
|
||||
// Forwards the completion query to the tag for TimestampPacket nodes.
// Every other tag type is always reported as completed.
template <typename TagType>
|
||||
bool TagNode<TagType>::isCompleted() const {
|
||||
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
|
||||
return tagForCpuAccess->isCompleted();
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TagType>
|
||||
MetricsLibraryApi::QueryHandle_1_0 &TagNode<TagType>::getQueryHandleRef() const {
|
||||
if constexpr (TagType::getTagNodeType() == TagNodeType::HwPerfCounter) {
|
||||
|
||||
Reference in New Issue
Block a user