feature: initial support for patching regular in-order CmdList

Related-To: NEO-7966

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz 2023-09-20 10:47:56 +00:00 committed by Compute-Runtime-Automation
parent 97e7cda912
commit 42496ac96d
10 changed files with 218 additions and 10 deletions

View File

@ -169,7 +169,7 @@ struct CommandListCoreFamily : CommandListImp {
ze_result_t appendSignalEvent(ze_event_handle_t hEvent) override;
ze_result_t appendWaitOnEvents(uint32_t numEvents, ze_event_handle_t *phEvent, bool relaxedOrderingAllowed, bool trackDependencies, bool signalInOrderCompletion) override;
void appendWaitOnInOrderDependency(NEO::GraphicsAllocation *dependencyCounterAllocation, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed);
void appendWaitOnInOrderDependency(NEO::GraphicsAllocation *dependencyCounterAllocation, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency);
void appendSignalInOrderDependencyCounter();
ze_result_t appendWriteGlobalTimestamp(uint64_t *dstptr, ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
@ -186,6 +186,7 @@ struct CommandListCoreFamily : CommandListImp {
ze_result_t executeCommandListImmediate(bool performMigration) override;
ze_result_t executeCommandListImmediateImpl(bool performMigration, L0::CommandQueue *cmdQImmediate);
size_t getReserveSshSize();
void patchInOrderCmds() override;
protected:
MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,
@ -330,6 +331,10 @@ struct CommandListCoreFamily : CommandListImp {
void handleInOrderImplicitDependencies(bool relaxedOrderingAllowed);
virtual void handleInOrderDependencyCounter();
bool isQwordInOrderCounter() const { return GfxFamily::isQwordInOrderCounter; }
void addCmdForPatching(void *cmd, uint64_t counterValue, InOrderPatchCommandTypes::CmdType cmdType);
InOrderPatchCommandsContainer<GfxFamily> inOrderPatchCmds;
};
template <PRODUCT_FAMILY gfxProductFamily>

View File

@ -151,7 +151,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::handleInOrderDependencyCounter() {
if (!isQwordInOrderCounter() && ((inOrderDependencyCounter + 1) == std::numeric_limits<uint32_t>::max())) {
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(inOrderDependencyCounterAllocation, inOrderDependencyCounter + 1, inOrderAllocationOffset, false);
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(inOrderDependencyCounterAllocation, inOrderDependencyCounter + 1, inOrderAllocationOffset, false, true);
inOrderDependencyCounter = 0;
@ -2226,7 +2226,7 @@ void CommandListCoreFamily<gfxCoreFamily>::handleInOrderImplicitDependencies(boo
NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers<GfxFamily>(*commandContainer.getCommandStream());
}
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(this->inOrderDependencyCounterAllocation, this->inOrderDependencyCounter, this->inOrderAllocationOffset, relaxedOrderingAllowed);
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(this->inOrderDependencyCounterAllocation, this->inOrderDependencyCounter, this->inOrderAllocationOffset, relaxedOrderingAllowed, true);
}
}
@ -2293,7 +2293,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(NEO::GraphicsAllocation *dependencyCounterAllocation, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed) {
void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(NEO::GraphicsAllocation *dependencyCounterAllocation, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency) {
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
UNRECOVERABLE_IF(waitValue > std::numeric_limits<uint32_t>::max());
@ -2307,9 +2307,16 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(NEO::Gr
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::Less, true, isQwordInOrderCounter());
} else {
NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(),
gpuAddress, waitValue,
COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, false, isQwordInOrderCounter(), false);
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
auto semaphoreCommand = reinterpret_cast<MI_SEMAPHORE_WAIT *>(commandContainer.getCommandStream()->getSpace(sizeof(MI_SEMAPHORE_WAIT)));
NEO::EncodeSemaphore<GfxFamily>::programMiSemaphoreWait(semaphoreCommand, gpuAddress, waitValue, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD,
false, true, isQwordInOrderCounter(), false);
if (implicitDependency) {
addCmdForPatching(semaphoreCommand, waitValue, InOrderPatchCommandTypes::CmdType::Semaphore);
}
}
gpuAddress += sizeof(uint64_t);
@ -2376,7 +2383,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
return ZE_RESULT_ERROR_INVALID_ARGUMENT; // in-order event not signaled yet
}
if (isInOrderEventWaitRequired(*event)) {
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(event->getInOrderExecDataAllocation(), event->getInOrderExecSignalValue(), event->getInOrderAllocationOffset(), relaxedOrderingAllowed);
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(event->getInOrderExecDataAllocation(), event->getInOrderExecSignalValue(), event->getInOrderAllocationOffset(), relaxedOrderingAllowed, false);
}
continue;
}
@ -2413,12 +2420,18 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter() {
using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;
uint64_t signalValue = this->inOrderDependencyCounter + 1;
uint64_t gpuVa = this->inOrderDependencyCounterAllocation->getGpuAddress() + this->inOrderAllocationOffset;
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(*commandContainer.getCommandStream(), gpuVa,
getLowPart(signalValue), getHighPart(signalValue), isQwordInOrderCounter(), (this->partitionCount > 1));
auto miStoreCmd = reinterpret_cast<MI_STORE_DATA_IMM *>(commandContainer.getCommandStream()->getSpace(sizeof(MI_STORE_DATA_IMM)));
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(miStoreCmd, gpuVa, getLowPart(signalValue), getHighPart(signalValue),
isQwordInOrderCounter(), (this->partitionCount > 1));
addCmdForPatching(miStoreCmd, signalValue, InOrderPatchCommandTypes::CmdType::Sdi);
if (NEO::EncodeUserInterruptHelper::isOperationAllowed(NEO::EncodeUserInterruptHelper::onSignalingFenceMask)) {
NEO::EnodeUserInterrupt<GfxFamily>::encode(*commandContainer.getCommandStream());
@ -3439,4 +3452,21 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(Event *event,
}
}
// Remembers a command that was just programmed so that its counter value can be
// rewritten later by patchInOrderCmds().
//
// The command is recorded only when the EnableInOrderRegularCmdListPatching debug
// flag equals 1 and this command list is of TYPE_REGULAR; otherwise the call is a no-op.
//
// cmd          - pointer to the programmed command
// counterValue - counter value the command was programmed with
// cmdType      - kind of command that cmd points to
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::addCmdForPatching(void *cmd, uint64_t counterValue, InOrderPatchCommandTypes::CmdType cmdType) {
    // The debug flag is evaluated first, exactly as in the short-circuit form.
    if (NEO::DebugManager.flags.EnableInOrderRegularCmdListPatching.get() != 1) {
        return;
    }

    if (!(this->cmdListType == TYPE_REGULAR)) {
        return;
    }

    this->inOrderPatchCmds.emplace_back(cmd, counterValue, cmdType);
}
// Rewrites the counter value of every recorded command.
//
// Nothing is patched while regularCmdListSubmissionCounter is not greater than zero.
// Otherwise each recorded command is patched with
// (regularCmdListSubmissionCounter * inOrderDependencyCounter) as the value to append
// to its base counter value.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::patchInOrderCmds() {
    if (!(this->regularCmdListSubmissionCounter > 0)) {
        return;
    }

    auto counterOffset = this->regularCmdListSubmissionCounter * this->inOrderDependencyCounter;

    for (auto &patchEntry : this->inOrderPatchCmds) {
        patchEntry.patch(counterOffset);
    }
}
} // namespace L0

View File

@ -9,6 +9,7 @@
#include "shared/source/os_interface/os_time.h"
#include "level_zero/core/source/cmdlist/cmdlist.h"
#include "level_zero/core/source/helpers/in_order_patch_cmds.h"
#include <memory>
@ -35,6 +36,7 @@ struct CommandListImp : CommandList {
void addToMappedEventList(Event *event);
const std::vector<Event *> &peekMappedEventList() { return mappedTsEventList; }
void incRegularCmdListSubmissionCounter() { regularCmdListSubmissionCounter++; }
virtual void patchInOrderCmds() = 0;
protected:
NEO::GraphicsAllocation *inOrderDependencyCounterAllocation = nullptr;

View File

@ -540,6 +540,7 @@ void CommandQueueHw<gfxCoreFamily>::setupCmdListsAndContextParams(
auto commandList = static_cast<CommandListImp *>(CommandList::fromHandle(phCommandLists[i]));
commandList->setCsr(this->csr);
commandList->storeReferenceTsToMappedEvents(false);
commandList->patchInOrderCmds();
commandList->incRegularCmdListSubmissionCounter();
auto &commandContainer = commandList->getCmdContainer();

View File

@ -11,6 +11,7 @@ target_sources(${L0_STATIC_LIB_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/error_code_helper_l0.cpp
${CMAKE_CURRENT_SOURCE_DIR}/error_code_helper_l0.h
${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling_l0.cpp
${CMAKE_CURRENT_SOURCE_DIR}/in_order_patch_cmds.h
${CMAKE_CURRENT_SOURCE_DIR}/l0_gfx_core_helper_factory_init.inl
${CMAKE_CURRENT_SOURCE_DIR}/l0_populate_factory.h
${CMAKE_CURRENT_SOURCE_DIR}/properties_parser.h

View File

@ -0,0 +1,59 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/ptr_math.h"
#include <cstdint>
#include <vector>
namespace L0 {
namespace InOrderPatchCommandTypes {
// Kind of command recorded for later counter patching.
enum class CmdType {
// Command is reinterpreted as GfxFamily::MI_STORE_DATA_IMM when patched.
Sdi,
// Command is reinterpreted as GfxFamily::MI_SEMAPHORE_WAIT when patched.
Semaphore
};
// One recorded command whose counter payload can be rewritten after it was programmed.
//
// The entry stores the pointer to the command, the counter value the command was
// originally programmed with, and the command kind that selects how it is patched.
template <typename GfxFamily>
struct BaseCmd {
    BaseCmd(void *cmd, uint64_t baseCounterValue, CmdType cmdType) : cmd(cmd), baseCounterValue(baseCounterValue), cmdType(cmdType) {}

    // Writes (baseCounterValue + appendCounterValue) into the command.
    // Any command type other than Sdi or Semaphore trips UNRECOVERABLE_IF.
    void patch(uint64_t appendCounterValue) {
        if (CmdType::Sdi != cmdType) {
            UNRECOVERABLE_IF(CmdType::Semaphore != cmdType);
            patchSemaphore(appendCounterValue);
            return;
        }

        patchSdi(appendCounterValue);
    }

    void *cmd = nullptr;
    const uint64_t baseCounterValue = 0;
    const CmdType cmdType;

  protected:
    // Stores the low and high parts of the new value into data dwords 0 and 1.
    void patchSdi(uint64_t appendCounterValue) {
        using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;

        const uint64_t patchedValue = baseCounterValue + appendCounterValue;
        auto storeDataImm = reinterpret_cast<MI_STORE_DATA_IMM *>(cmd);

        storeDataImm->setDataDword0(getLowPart(patchedValue));
        storeDataImm->setDataDword1(getHighPart(patchedValue));
    }

    // Stores the new value, narrowed to 32 bits, as the semaphore data dword.
    void patchSemaphore(uint64_t appendCounterValue) {
        using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;

        const uint64_t patchedValue = baseCounterValue + appendCounterValue;
        auto semaphoreWait = reinterpret_cast<MI_SEMAPHORE_WAIT *>(cmd);

        semaphoreWait->setSemaphoreDataDword(static_cast<uint32_t>(patchedValue));
    }

    BaseCmd() = delete;
};
} // namespace InOrderPatchCommandTypes
template <typename GfxFamily>
using InOrderPatchCommandsContainer = std::vector<InOrderPatchCommandTypes::BaseCmd<GfxFamily>>;
} // namespace L0

View File

@ -79,6 +79,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass::inOrderAllocationOffset;
using BaseClass::inOrderDependencyCounter;
using BaseClass::inOrderDependencyCounterAllocation;
using BaseClass::inOrderPatchCmds;
using BaseClass::isFlushTaskSubmissionEnabled;
using BaseClass::isQwordInOrderCounter;
using BaseClass::isRelaxedOrderingDispatchAllowed;
@ -172,6 +173,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::immediateCmdListHeapSharing;
using BaseClass::inOrderDependencyCounter;
using BaseClass::inOrderDependencyCounterAllocation;
using BaseClass::inOrderPatchCmds;
using BaseClass::isBcsSplitNeeded;
using BaseClass::isFlushTaskSubmissionEnabled;
using BaseClass::isQwordInOrderCounter;
@ -269,6 +271,7 @@ struct MockCommandList : public CommandList {
ADDMETHOD_NOBASE(close, ze_result_t, ZE_RESULT_SUCCESS, ());
ADDMETHOD_NOBASE(destroy, ze_result_t, ZE_RESULT_SUCCESS, ());
ADDMETHOD_NOBASE_VOIDRETURN(patchInOrderCmds, (void));
ADDMETHOD_NOBASE(appendLaunchKernel, ze_result_t, ZE_RESULT_SUCCESS,
(ze_kernel_handle_t kernelHandle,

View File

@ -2737,6 +2737,18 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenDoingCpuCopyThenSynchronize,
context->freeMem(deviceAlloc);
}
// With the patching debug flag enabled, an append on an immediate command list
// must leave the list of commands to patch empty.
HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenUsingImmediateCmdListThenDontAddCmdsToPatch, IsAtLeastXeHpCore) {
    DebugManager.flags.EnableInOrderRegularCmdListPatching.set(1);

    auto copyOnlyImmCmdList = createCopyOnlyImmCmdList<gfxCoreFamily>();

    // Source and destination are the same 1-byte host location.
    uint32_t hostData = 0;
    copyOnlyImmCmdList->appendMemoryCopy(&hostData, &hostData, 1, nullptr, 0, nullptr, false, false);

    EXPECT_EQ(0u, copyOnlyImmCmdList->inOrderPatchCmds.size());
}
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenGpuHangDetectedInCpuCopyPathThenReportError, IsAtLeastXeHpCore) {
auto immCmdList = createImmCmdList<gfxCoreFamily>();
immCmdList->copyThroughLockedPtrEnabled = true;
@ -3216,6 +3228,99 @@ HWTEST2_F(InOrderRegularCmdListTests, givenInOrderFlagWhenCreatingCmdListThenEna
EXPECT_EQ(ZE_RESULT_SUCCESS, zeCommandListDestroy(cmdList));
}
// Verifies that a regular in-order command list records its SDI and semaphore commands
// for patching, and that each executeCommandLists call after the first rewrites their
// counter values by inOrderDependencyCounter * <number of previous executions>.
HWTEST2_F(InOrderRegularCmdListTests, givenDebugFlagSetWhenUsingRegularCmdListThenAddCmdsToPatch, IsAtLeastXeHpCore) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
// Recording of commands for patching is enabled only when this flag equals 1.
DebugManager.flags.EnableInOrderRegularCmdListPatching.set(1);
ze_command_queue_desc_t desc = {};
auto mockCmdQHw = makeZeUniquePtr<MockCommandQueueHw<gfxCoreFamily>>(device, device->getNEODevice()->getDefaultEngine().commandStreamReceiver, &desc);
mockCmdQHw->initialize(true, false, false);
auto regularCmdList = createRegularCmdList<gfxCoreFamily>(true);
auto cmdStream = regularCmdList->getCmdContainer().getCommandStream();
// Remember where the first append starts so only its commands are parsed below.
size_t offset = cmdStream->getUsed();
uint32_t copyData = 0;
regularCmdList->appendMemoryCopy(&copyData, &copyData, 1, nullptr, 0, nullptr, false, false);
EXPECT_EQ(1u, regularCmdList->inOrderPatchCmds.size()); // SDI
auto sdiFromContainer1 = reinterpret_cast<MI_STORE_DATA_IMM *>(regularCmdList->inOrderPatchCmds[0].cmd);
MI_STORE_DATA_IMM *sdiFromParser1 = nullptr;
// Locate the SDI of the first append by parsing the command stream.
{
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
ptrOffset(cmdStream->getCpuBase(), offset),
(cmdStream->getUsed() - offset)));
auto itor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itor);
sdiFromParser1 = genCmdCast<MI_STORE_DATA_IMM *>(*itor);
}
// The second append is parsed separately, starting from the current end of the stream.
offset = cmdStream->getUsed();
regularCmdList->appendMemoryCopy(&copyData, &copyData, 1, nullptr, 0, nullptr, false, false);
ASSERT_EQ(3u, regularCmdList->inOrderPatchCmds.size()); // SDI + Semaphore + SDI
auto semaphoreFromContainer2 = reinterpret_cast<MI_SEMAPHORE_WAIT *>(regularCmdList->inOrderPatchCmds[1].cmd);
MI_SEMAPHORE_WAIT *semaphoreFromParser2 = nullptr;
auto sdiFromContainer2 = reinterpret_cast<MI_STORE_DATA_IMM *>(regularCmdList->inOrderPatchCmds[2].cmd);
MI_STORE_DATA_IMM *sdiFromParser2 = nullptr;
// Locate the semaphore of the second append and the first SDI that follows it.
{
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
ptrOffset(cmdStream->getCpuBase(), offset),
(cmdStream->getUsed() - offset)));
auto semaphoreItor = find<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), semaphoreItor);
semaphoreFromParser2 = genCmdCast<MI_SEMAPHORE_WAIT *>(*semaphoreItor);
auto sdiItor = find<MI_STORE_DATA_IMM *>(semaphoreItor, cmdList.end());
ASSERT_NE(cmdList.end(), sdiItor);
sdiFromParser2 = genCmdCast<MI_STORE_DATA_IMM *>(*sdiItor);
}
EXPECT_EQ(2u, regularCmdList->inOrderDependencyCounter);
// executionCounter is the number of executions that preceded the one being verified.
// Base values: 1 for the first SDI and the semaphore, 2 for the second SDI.
auto verifyPatching = [&](uint64_t executionCounter) {
auto appendValue = regularCmdList->inOrderDependencyCounter * executionCounter;
EXPECT_EQ(1u + appendValue, sdiFromContainer1->getDataDword0());
EXPECT_EQ(1u + appendValue, sdiFromParser1->getDataDword0());
EXPECT_EQ(1u + appendValue, semaphoreFromContainer2->getSemaphoreDataDword());
EXPECT_EQ(1u + appendValue, semaphoreFromParser2->getSemaphoreDataDword());
EXPECT_EQ(2u + appendValue, sdiFromContainer2->getDataDword0());
EXPECT_EQ(2u + appendValue, sdiFromParser2->getDataDword0());
};
regularCmdList->close();
auto handle = regularCmdList->toHandle();
// First execution: the commands are expected to still hold their base values.
mockCmdQHw->executeCommandLists(1, &handle, nullptr, false);
verifyPatching(0);
// Each further execution is expected to add inOrderDependencyCounter once more.
mockCmdQHw->executeCommandLists(1, &handle, nullptr, false);
verifyPatching(1);
mockCmdQHw->executeCommandLists(1, &handle, nullptr, false);
verifyPatching(2);
}
HWTEST2_F(InOrderRegularCmdListTests, givenInOrderModeWhenDispatchingRegularCmdListThenProgramPipeControlsToHandleDependencies, IsAtLeastXeHpCore) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;

View File

@ -253,6 +253,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideDriverVersion, -1, "-1: default, >=0: Us
DECLARE_DEBUG_VARIABLE(int32_t, WaitForUserFenceOnEventHostSynchronize, -1, "-1: default, 0: Disabled, 1: Enabled. If enabled, use WaitUserFence KMD call for in-order Events instead of active polling on host.")
DECLARE_DEBUG_VARIABLE(int32_t, DisableSystemPointerKernelArgument, -1, "-1: default, 0: Disabled, 1: using a system pointer for kernel argument returns an error.")
DECLARE_DEBUG_VARIABLE(int32_t, ProgramUserInterruptOnResolvedDependency, -1, "-1: default, 0: Disabled, >=1: bitfield. 01b: program after semaphore, 10b: on signaling fence (non-walker append).")
DECLARE_DEBUG_VARIABLE(int32_t, EnableInOrderRegularCmdListPatching, -1, "-1: default, 0: Disabled, 1: If set, patch counter value on execute call")
/*LOGGING FLAGS*/
DECLARE_DEBUG_VARIABLE(int32_t, PrintDriverDiagnostics, -1, "prints driver diagnostics messages to standard output, value corresponds to hint level")

View File

@ -548,5 +548,6 @@ WaitForUserFenceOnEventHostSynchronize = -1
ProgramUserInterruptOnResolvedDependency = -1
DisableSystemPointerKernelArgument = -1
DoNotValidateDriverPath = 0
EnableInOrderRegularCmdListPatching = -1
ForceInOrderEvents = -1
# Please don't edit below this line