Introduce barrier tracking mechanism

Related-To: NEO-7696

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2023-02-06 10:12:34 +00:00
committed by Compute-Runtime-Automation
parent 6f3503af38
commit 9f574b6fba
6 changed files with 114 additions and 4 deletions

View File

@@ -2036,6 +2036,14 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
}
}
if (this->cmdListType == TYPE_IMMEDIATE && isCopyOnly()) {
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
const auto &productHelper = this->device->getProductHelper();
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), this->csr->getBarrierCountGpuAddress(), this->csr->getNextBarrierCount() + 1, args, productHelper);
commandContainer.addToResidencyContainer(this->csr->getTagAllocation());
}
if (NEO::DebugManager.flags.EnableSWTags.get()) {
neoDevice->getRootDeviceEnvironment().tagsManager->insertTag<GfxFamily, NEO::SWTags::CallNameEndTag>(
*commandContainer.getCommandStream(),
@@ -2563,8 +2571,18 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
if (isCopyOnly()) {
NEO::MiFlushArgs args;
uint64_t gpuAddress = 0u;
TaskCountType value = 0u;
if (this->cmdListType == TYPE_IMMEDIATE) {
args.commandWithPostSync = true;
gpuAddress = this->csr->getBarrierCountGpuAddress();
value = this->csr->getNextBarrierCount() + 1;
commandContainer.addToResidencyContainer(this->csr->getTagAllocation());
}
const auto &productHelper = this->device->getProductHelper();
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args, productHelper);
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), gpuAddress, value, args, productHelper);
} else {
appendComputeBarrierCommand();
}

View File

@@ -6,6 +6,7 @@
*/
#include "shared/source/built_ins/sip.h"
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/encode_surface_state.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
@@ -704,6 +705,71 @@ HWTEST_F(CommandListImmediateFlushTaskComputeTests, givenUseCsrImmediateSubmissi
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
}
// Checks that appendBarrier on an immediate copy-only command list advances the
// CSR barrier counter and programs an MI_FLUSH_DW whose post-sync immediate write
// targets the CSR barrier-count GPU address.
HWTEST2_F(CommandListCreate, givenImmediateCopyOnlyCmdListWhenAppendBarrierThenIncrementBarrierCountAndDispatchBarrierTagUpdate, IsAtLeastSkl) {
    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;

    ze_command_queue_desc_t queueDesc = {};
    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &queueDesc, false, NEO::EngineGroupType::Copy, returnValue));
    // Fatal check: everything below dereferences the command list.
    ASSERT_NE(nullptr, commandList.get());

    // getNextBarrierCount() is a fetch-and-increment, so this probe itself consumes value 0.
    EXPECT_EQ(commandList->csr->getNextBarrierCount(), 0u);

    auto result = commandList->appendBarrier(nullptr, 0, nullptr);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    // The barrier consumed value 1, so the next value handed out is 2.
    EXPECT_EQ(commandList->csr->getNextBarrierCount(), 2u);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));

    auto itor = find<MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
    // Fatal checks: the iterator is advanced and dereferenced below, which is
    // undefined behaviour on end().
    ASSERT_NE(cmdList.end(), itor);
    if (EncodeMiFlushDW<FamilyType>::getMiFlushDwWaSize()) {
        // NOTE(review): presumably the first MI_FLUSH_DW is the workaround command
        // emitted ahead of the real one - confirm against programMiFlushDw.
        ++itor;
        ASSERT_NE(cmdList.end(), itor);
    }

    auto cmd = genCmdCast<MI_FLUSH_DW *>(*itor);
    ASSERT_NE(nullptr, cmd);
    EXPECT_EQ(cmd->getPostSyncOperation(), MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD);
    EXPECT_EQ(cmd->getDestinationAddress(), commandList->csr->getBarrierCountGpuAddress());
    EXPECT_EQ(cmd->getImmediateData(), 2u);
}
// Checks that appendWaitOnEvents on an immediate copy-only command list advances
// the CSR barrier counter and programs an MI_FLUSH_DW whose post-sync immediate
// write targets the CSR barrier-count GPU address.
HWTEST2_F(CommandListCreate, givenImmediateCopyOnlyCmdListWhenAppendWaitOnEventsThenIncrementBarrierCountAndDispatchBarrierTagUpdate, IsAtLeastSkl) {
    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;

    ze_command_queue_desc_t queueDesc = {};
    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &queueDesc, false, NEO::EngineGroupType::Copy, returnValue));
    // Fatal check: everything below dereferences the command list.
    ASSERT_NE(nullptr, commandList.get());

    // getNextBarrierCount() is a fetch-and-increment, so this probe itself consumes value 0.
    EXPECT_EQ(commandList->csr->getNextBarrierCount(), 0u);

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
    eventDesc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    // Fatal checks: the pool is handed to Event::create and the event is dereferenced.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    ASSERT_NE(nullptr, eventPool.get());
    auto event = std::unique_ptr<Event>(Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
    ASSERT_NE(nullptr, event.get());
    auto eventHandle = event->toHandle();

    result = commandList->appendWaitOnEvents(1u, &eventHandle, false);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    // The wait consumed value 1, so the next value handed out is 2.
    EXPECT_EQ(commandList->csr->getNextBarrierCount(), 2u);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));

    auto itor = find<MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
    // Fatal checks: the iterator is advanced and dereferenced below, which is
    // undefined behaviour on end().
    ASSERT_NE(cmdList.end(), itor);
    if (EncodeMiFlushDW<FamilyType>::getMiFlushDwWaSize()) {
        // NOTE(review): presumably the first MI_FLUSH_DW is the workaround command
        // emitted ahead of the real one - confirm against programMiFlushDw.
        ++itor;
        ASSERT_NE(cmdList.end(), itor);
    }

    auto cmd = genCmdCast<MI_FLUSH_DW *>(*itor);
    ASSERT_NE(nullptr, cmd);
    EXPECT_EQ(cmd->getPostSyncOperation(), MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD);
    EXPECT_EQ(cmd->getDestinationAddress(), commandList->csr->getBarrierCountGpuAddress());
    EXPECT_EQ(cmd->getImmediateData(), 2u);
}
HWTEST_F(CommandListImmediateFlushTaskComputeTests, givenUseCsrImmediateSubmissionDisabledForImmediateWhenAppendBarrierWithEventThenSuccessIsReturned) {
NEO::DebugManager.flags.EnableFlushTaskSubmission.set(0);

View File

@@ -743,6 +743,8 @@ bool CommandStreamReceiver::initializeTagAllocation() {
userPauseConfirmation = Thread::create(CommandStreamReceiver::asyncDebugBreakConfirmation, reinterpret_cast<void *>(this));
}
this->barrierCountTagAddress = ptrOffset(this->tagAddress, TagAllocationLayout::barrierCountOffset);
return true;
}
@@ -1009,7 +1011,7 @@ TaskCountType CompletionStamp::getTaskCountFromSubmissionStatusError(SubmissionS
return 0;
}
}
// Returns the GPU address of the barrier-count slot: the tag allocation's GPU
// address advanced by TagAllocationLayout::barrierCountOffset.
uint64_t CommandStreamReceiver::getBarrierCountGpuAddress() const {
    const auto tagGpuBase = this->tagAllocation->getGpuAddress();
    return ptrOffset(tagGpuBase, TagAllocationLayout::barrierCountOffset);
}
// Returns the GPU address of the debug-pause-state slot: the tag allocation's GPU
// address plus TagAllocationLayout::debugPauseStateAddressOffset.
uint64_t CommandStreamReceiver::getDebugPauseStateGPUAddress() const {
    const auto tagGpuBase = tagAllocation->getGpuAddress();
    return tagGpuBase + TagAllocationLayout::debugPauseStateAddressOffset;
}
uint64_t CommandStreamReceiver::getCompletionAddress() const {
uint64_t completionFenceAddress = castToUint64(const_cast<TagAddressType *>(tagAddress));

View File

@@ -132,7 +132,11 @@ class CommandStreamReceiver {
return tagsMultiAllocation;
}
MultiGraphicsAllocation &createTagsMultiAllocation();
// Atomically increments barrierCount and returns the value it held BEFORE the
// increment (fetch_add semantics). Every call advances the counter, including
// calls made only to inspect it.
TaskCountType getNextBarrierCount() { return this->barrierCount.fetch_add(1u); }
// Returns the stored tagAddress pointer.
volatile TagAddressType *getTagAddress() const { return tagAddress; }
// Returns the stored barrierCountTagAddress pointer (nullptr by default; assigned
// in initializeTagAllocation()).
volatile TagAddressType *getBarrierCountTagAddress() const { return this->barrierCountTagAddress; }
// GPU address of the barrier-count slot inside the tag allocation (defined out of line).
uint64_t getBarrierCountGpuAddress() const;
// GPU address of the debug-pause-state slot inside the tag allocation (defined out of line).
uint64_t getDebugPauseStateGPUAddress() const;
virtual bool waitForFlushStamp(FlushStamp &flushStampToWait) { return true; }
@@ -424,6 +428,7 @@ class CommandStreamReceiver {
uint64_t totalMemoryUsed = 0u;
volatile TagAddressType *tagAddress = nullptr;
volatile TagAddressType *barrierCountTagAddress = nullptr;
volatile DebugPauseState *debugPauseStateAddress = nullptr;
SpinLock debugPauseStateLock;
static void *asyncDebugBreakConfirmation(void *arg);
@@ -445,6 +450,7 @@ class CommandStreamReceiver {
OsContext *osContext = nullptr;
TaskCountType *completionFenceValuePointer = nullptr;
std::atomic<TaskCountType> barrierCount{0};
// current taskLevel. Used for determining if a PIPE_CONTROL is needed.
std::atomic<TaskCountType> taskLevel{0};
std::atomic<TaskCountType> latestSentTaskCount{0};

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022 Intel Corporation
* Copyright (C) 2022-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -12,6 +12,6 @@ namespace NEO {
// Offsets of the individual slots that share the command stream receiver's tag
// allocation. NOTE(review): the values look like byte offsets (multiples of
// MemoryConstants::kiloByte) - confirm against ptrOffset and the allocation size.
namespace TagAllocationLayout {
// Added to the tag allocation GPU address by CommandStreamReceiver::getDebugPauseStateGPUAddress().
inline constexpr uint64_t debugPauseStateAddressOffset = MemoryConstants::kiloByte;
// NOTE(review): presumably locates the completion fence value - its consumer is not visible here.
inline constexpr uint64_t completionFenceOffset = 2 * MemoryConstants::kiloByte;
// Applied to both the CPU tag address (initializeTagAllocation) and the GPU address
// (getBarrierCountGpuAddress) to locate the barrier-count slot.
inline constexpr uint64_t barrierCountOffset = 3 * MemoryConstants::kiloByte;
} // namespace TagAllocationLayout
} // namespace NEO

View File

@@ -1454,6 +1454,24 @@ TEST(CommandStreamReceiverSimpleTest, givenPrintfTagAllocationAddressFlagEnabled
EXPECT_TRUE(hasSubstr(output, std::string(expectedStr)));
}
// Checks that initializeTagAllocation() places both the CPU-visible barrier-count
// pointer and the barrier-count GPU address at TagAllocationLayout::barrierCountOffset
// from the respective tag base.
TEST(CommandStreamReceiverSimpleTest, whenInitializeTagAllocationThenBarrierCountAddressAreSet) {
    DeviceBitfield deviceBitfield(1);
    auto osContext = std::unique_ptr<OsContext>(OsContext::create(nullptr, 0, 0,
                                                                  EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})));

    MockExecutionEnvironment executionEnvironment;
    executionEnvironment.prepareRootDeviceEnvironments(1);
    executionEnvironment.initializeMemoryManager();

    MockCommandStreamReceiver csr(executionEnvironment, 0, deviceBitfield);
    csr.setupContext(*osContext);

    // Fatal checks: a failed initialization would leave the tag allocation unset,
    // and getTagAllocation() is dereferenced below.
    ASSERT_TRUE(csr.initializeTagAllocation());
    ASSERT_NE(nullptr, csr.getTagAllocation());

    EXPECT_EQ(csr.getBarrierCountTagAddress(), ptrOffset(csr.getTagAddress(), TagAllocationLayout::barrierCountOffset));
    EXPECT_EQ(csr.getBarrierCountGpuAddress(), ptrOffset(csr.getTagAllocation()->getGpuAddress(), TagAllocationLayout::barrierCountOffset));
}
TEST(CommandStreamReceiverSimpleTest, givenGpuIdleImplicitFlushCheckDisabledWhenGpuIsIdleThenReturnFalse) {
MockExecutionEnvironment executionEnvironment;
executionEnvironment.prepareRootDeviceEnvironments(1);