feature: wait only for 1 semaphore in atomic signalling mode

Related-To: NEO-7966

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-12-01 13:20:13 +00:00
committed by Compute-Runtime-Automation
parent 93c2634c93
commit 9c53d9a712
11 changed files with 70 additions and 13 deletions

View File

@@ -340,7 +340,7 @@ struct CommandListCoreFamily : public CommandListImp {
void addCmdForPatching(std::shared_ptr<InOrderExecInfo> *externalInOrderExecInfo, void *cmd1, void *cmd2, uint64_t counterValue, InOrderPatchCommandHelpers::PatchCmdType patchCmdType);
bool inOrderAtomicSignallingEnabled() const;
bool inOrderAtomicSignallingEnabled() const override;
uint64_t getInOrderIncrementValue() const;
InOrderPatchCommandsContainer<GfxFamily> inOrderPatchCmds;

View File

@@ -2420,7 +2420,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(std::sh
uint64_t gpuAddress = dependencyCounterAllocation.getGpuAddress() + offset;
for (uint32_t i = 0; i < this->partitionCount; i++) {
for (uint32_t i = 0; i < inOrderExecInfo->getNumDevicePartitionsToWait(); i++) {
if (relaxedOrderingAllowed) {
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::Less, true, isQwordInOrderCounter());

View File

@@ -1284,9 +1284,9 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::synchronizeInOrderExe
bool signaled = true;
auto hostAddress = static_cast<uint64_t *>(ptrOffset(inOrderExecInfo->getDeviceCounterAllocation().getUnderlyingBuffer(), this->inOrderAllocationOffset));
const uint64_t *hostAddress = ptrOffset(inOrderExecInfo->getHostAddress(), this->inOrderAllocationOffset);
for (uint32_t i = 0; i < this->partitionCount; i++) {
for (uint32_t i = 0; i < inOrderExecInfo->getNumHostPartitionsToWait(); i++) {
if (!NEO::WaitUtils::waitFunctionWithPredicate<const uint64_t>(hostAddress, waitValue, std::greater_equal<uint64_t>())) {
signaled = false;
break;

View File

@@ -234,9 +234,7 @@ void CommandListImp::enableInOrderExecution() {
UNRECOVERABLE_IF(!inOrderDependencyCounterAllocation);
memset(inOrderDependencyCounterAllocation->getUnderlyingBuffer(), 0, inOrderDependencyCounterAllocation->getUnderlyingBufferSize());
inOrderExecInfo = std::make_shared<InOrderExecInfo>(*inOrderDependencyCounterAllocation, *device->getMemoryManager(), (this->cmdListType == TYPE_REGULAR));
inOrderExecInfo = std::make_shared<InOrderExecInfo>(*inOrderDependencyCounterAllocation, *device->getMemoryManager(), this->partitionCount, (this->cmdListType == TYPE_REGULAR), inOrderAtomicSignallingEnabled());
}
void CommandListImp::storeReferenceTsToMappedEvents(bool isClearEnabled) {

View File

@@ -44,6 +44,8 @@ struct CommandListImp : public CommandList {
~CommandListImp() override = default;
virtual bool inOrderAtomicSignallingEnabled() const = 0;
static constexpr int32_t cmdListDefaultEngineInstancedDevice = NEO::StreamProperty::initValue;
static constexpr bool cmdListDefaultCoherency = false;
static constexpr bool cmdListDefaultDisableOverdispatch = true;

View File

@@ -163,11 +163,11 @@ ze_result_t EventImp<TagSizeT>::queryCounterBasedEventStatus() {
return ZE_RESULT_NOT_READY;
}
auto hostAddress = static_cast<uint64_t *>(ptrOffset(inOrderExecInfo->getDeviceCounterAllocation().getUnderlyingBuffer(), this->inOrderAllocationOffset));
const uint64_t *hostAddress = ptrOffset(inOrderExecInfo->getHostAddress(), this->inOrderAllocationOffset);
auto waitValue = getInOrderExecSignalValueWithSubmissionCounter();
bool signaled = true;
for (uint32_t i = 0; i < this->getPacketsInUse(); i++) {
for (uint32_t i = 0; i < inOrderExecInfo->getNumHostPartitionsToWait(); i++) {
if (!NEO::WaitUtils::waitFunctionWithPredicate<const uint64_t>(hostAddress, waitValue, std::greater_equal<uint64_t>())) {
signaled = false;
break;
@@ -438,7 +438,7 @@ ze_result_t EventImp<TagSizeT>::waitForUserFence(uint64_t timeout) {
return ZE_RESULT_NOT_READY;
}
uint64_t waitAddress = castToUint64(ptrOffset(inOrderExecInfo->getDeviceCounterAllocation().getUnderlyingBuffer(), this->inOrderAllocationOffset));
uint64_t waitAddress = castToUint64(ptrOffset(inOrderExecInfo->getHostAddress(), this->inOrderAllocationOffset));
if (!csrs[0]->waitUserFence(getInOrderExecSignalValueWithSubmissionCounter(), waitAddress, timeout)) {
return ZE_RESULT_NOT_READY;

View File

@@ -19,8 +19,15 @@ InOrderExecInfo::~InOrderExecInfo() {
memoryManager.freeGraphicsMemory(&deviceCounterAllocation);
}
InOrderExecInfo::InOrderExecInfo(NEO::GraphicsAllocation &deviceCounterAllocation, NEO::MemoryManager &memoryManager, bool regularCmdList)
// Builds the in-order execution bookkeeping for a counter allocation.
//
// deviceCounterAllocation - allocation holding the in-order counter; kept by reference.
// memoryManager           - kept by reference (the destructor uses it to free the allocation).
// partitionCount          - number of partitions the owning command list works with.
// regularCmdList          - stored as-is in the regularCmdList member.
// atomicDeviceSignalling  - when true, device-side waits are limited to a single partition.
InOrderExecInfo::InOrderExecInfo(NEO::GraphicsAllocation &deviceCounterAllocation, NEO::MemoryManager &memoryManager, uint32_t partitionCount, bool regularCmdList, bool atomicDeviceSignalling)
    : deviceCounterAllocation(deviceCounterAllocation), memoryManager(memoryManager), regularCmdList(regularCmdList) {
    // Device-side waiters look at every partition unless atomic signalling is enabled,
    // in which case a single partition is enough.
    numDevicePartitionsToWait = partitionCount;
    if (atomicDeviceSignalling) {
        numDevicePartitionsToWait = 1;
    }

    // Host-side waiters always cover all partitions, regardless of the signalling mode.
    numHostPartitionsToWait = partitionCount;

    // CPU-visible view of the counter: start of the allocation's underlying buffer.
    hostAddress = reinterpret_cast<uint64_t *>(deviceCounterAllocation.getUnderlyingBuffer());

    reset();
}
void InOrderExecInfo::reset() {

View File

@@ -27,9 +27,10 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
InOrderExecInfo() = delete;
InOrderExecInfo(NEO::GraphicsAllocation &deviceCounterAllocation, NEO::MemoryManager &memoryManager, bool regularCmdList);
InOrderExecInfo(NEO::GraphicsAllocation &deviceCounterAllocation, NEO::MemoryManager &memoryManager, uint32_t partitionCount, bool regularCmdList, bool atomicDeviceSignalling);
// Gives access to the counter allocation this object was constructed with.
NEO::GraphicsAllocation &getDeviceCounterAllocation() const {
    return deviceCounterAllocation;
}
// Returns the CPU pointer to the counter storage. The constructor points it at the
// underlying buffer of the device counter allocation. The pointee is intentionally
// non-const even though the accessor is const.
uint64_t *getHostAddress() const {
    return hostAddress;
}
// Returns the current value of the software-side counter.
uint64_t getCounterValue() const {
    return counterValue;
}
// Advances the software-side counter by addValue.
void addCounterValue(uint64_t addValue) {
    counterValue = counterValue + addValue;
}
@@ -40,6 +41,9 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
// Reports the regularCmdList flag passed at construction.
bool isRegularCmdList() const {
    return regularCmdList;
}
// Number of partitions a device-side wait has to cover. The constructor sets this
// to 1 when atomic device signalling is requested, otherwise to the partition count.
uint32_t getNumDevicePartitionsToWait() const {
    return numDevicePartitionsToWait;
}
// Number of partitions a host-side wait has to cover. The constructor always sets
// this to the partition count, independent of the signalling mode.
uint32_t getNumHostPartitionsToWait() const {
    return numHostPartitionsToWait;
}
void reset();
protected:
@@ -47,6 +51,9 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
NEO::MemoryManager &memoryManager;
uint64_t counterValue = 0;
uint64_t regularCmdListSubmissionCounter = 0;
uint64_t *hostAddress = nullptr;
uint32_t numDevicePartitionsToWait = 0;
uint32_t numHostPartitionsToWait = 0;
bool regularCmdList = false;
};

View File

@@ -293,6 +293,7 @@ struct MockCommandList : public CommandList {
ADDMETHOD_NOBASE(close, ze_result_t, ZE_RESULT_SUCCESS, ());
ADDMETHOD_NOBASE(destroy, ze_result_t, ZE_RESULT_SUCCESS, ());
ADDMETHOD_NOBASE_VOIDRETURN(patchInOrderCmds, (void));
ADDMETHOD_CONST_NOBASE(inOrderAtomicSignallingEnabled, bool, false, (void));
ADDMETHOD_NOBASE(appendLaunchKernel, ze_result_t, ZE_RESULT_SUCCESS,
(ze_kernel_handle_t kernelHandle,

View File

@@ -4164,6 +4164,47 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenAtomicSignallingEnabledWhenSignalli
EXPECT_EQ(0u, atomicCmd->getCsStall());
}
// Checks the number of MI_SEMAPHORE_WAIT commands programmed for in-order dependencies
// on a multi-tile immediate command list when the InOrderAtomicSignallingEnabled debug
// flag is set: one for the list's own (implicit) dependency and one for the waited event,
// plus one more when ImplicitScalingDispatch reports that a pipe-control stall is required.
HWTEST2_F(MultiTileInOrderCmdListTests, givenAtomicSignallingEnabledWhenWaitingForDependencyThenUseOnlyOneSemaphore, IsAtLeastXeHpCore) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
// Force atomic signalling on before any command list is created.
debugManager.flags.InOrderAtomicSignallingEnabled.set(1);
auto immCmdList1 = createMultiTileImmCmdList<gfxCoreFamily>();
auto immCmdList2 = createMultiTileImmCmdList<gfxCoreFamily>();
auto eventPool = createEvents<FamilyType>(1, false);
auto handle = events[0]->toHandle();
// First list launches a kernel that signals the event.
immCmdList1->appendLaunchKernel(kernel->toHandle(), groupCount, handle, 0, nullptr, launchParams, false);
// After that single append the list's counter is expected to equal partitionCount.
EXPECT_EQ(partitionCount, immCmdList1->inOrderExecInfo->getCounterValue());
auto cmdStream = immCmdList2->getCmdContainer().getCommandStream();
// Warm-up launch on the second list (no wait events); its commands are excluded from parsing below.
immCmdList2->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
// Remember where the launch under test starts in the command stream.
size_t offset = cmdStream->getUsed();
// Launch under test: waits on the event signalled by the first list.
immCmdList2->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 1, &handle, launchParams, false);
GenCmdList cmdList;
// Parse only the commands emitted by the last append.
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
// Two dependency semaphores, plus one extra when a pipe-control stall is required.
ASSERT_EQ(2u + (ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired() ? 1 : 0), semaphores.size());
// itor is shared by both verifyInOrderDependency calls below.
// NOTE(review): presumably the helper advances it so the second check continues after the first match - confirm against the helper.
auto itor = cmdList.begin();
// implicit dependency
auto gpuAddress = immCmdList2->inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress();
ASSERT_TRUE(verifyInOrderDependency<FamilyType>(itor, partitionCount, gpuAddress, immCmdList2->isQwordInOrderCounter()));
// event
ASSERT_TRUE(verifyInOrderDependency<FamilyType>(itor, partitionCount, events[0]->inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress(), immCmdList2->isQwordInOrderCounter()));
}
HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgrammingWaitOnEventsThenHandleAllEventPackets, IsAtLeastXeHpCore) {
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;

View File

@@ -3274,7 +3274,8 @@ HWTEST_F(EventTests, givenInOrderEventWhenHostSynchronizeIsCalledThenAllocationI
auto syncAllocation = new NEO::MockGraphicsAllocation(&storage, sizeof(storage));
auto inOrderExecInfo = std::make_shared<InOrderExecInfo>(*syncAllocation, *neoDevice->getMemoryManager(), false);
auto inOrderExecInfo = std::make_shared<InOrderExecInfo>(*syncAllocation, *neoDevice->getMemoryManager(), 1, false, false);
*inOrderExecInfo->getHostAddress() = 1;
event->enableCounterBasedMode(true);
event->updateInOrderExecState(inOrderExecInfo, 1, 0);