fix: improve TBX downloading after L0 Event sync

Related-To: HSD-18038498579

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2024-08-22 14:04:53 +00:00
committed by Compute-Runtime-Automation
parent 4b01058706
commit 696b02bfd3
17 changed files with 168 additions and 21 deletions

View File

@@ -1015,9 +1015,9 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint6
}
if (this->isTbxMode && (status == ZE_RESULT_SUCCESS)) {
mainQueueCsr->downloadAllocations();
mainQueueCsr->downloadAllocations(true);
if (isCopyOffloadEnabled()) {
copyOffloadCsr->downloadAllocations();
copyOffloadCsr->downloadAllocations(true);
}
}

View File

@@ -59,6 +59,7 @@ struct EventImp : public Event {
protected:
ze_result_t waitForUserFence(uint64_t timeout);
void downloadAllTbxAllocations();
bool handlePreQueryStatusOperationsAndCheckCompletion();
bool tbxDownload(NEO::CommandStreamReceiver &csr, bool &downloadedAllocation, bool &downloadedInOrdedAllocation);

View File

@@ -250,12 +250,38 @@ ze_result_t EventImp<TagSizeT>::queryCounterBasedEventStatus() {
return ZE_RESULT_SUCCESS;
}
// Downloads TBX allocations from every command stream receiver that may hold data for this event.
//
// Pass 1: blocking download on each CSR tracked in `csrs`.
// Pass 2: for every engine of every sub-device of the root device, a non-blocking download is
//         requested when that engine's OS context has used either the event pool allocation or
//         the in-order device counter allocation.
template <typename TagSizeT>
void EventImp<TagSizeT>::downloadAllTbxAllocations() {
    for (auto &csr : csrs) {
        csr->downloadAllocations(true);
    }

    // Both lookups are independent of the engine being inspected, so resolve them once
    // instead of once per engine.
    auto poolAllocation = getPoolAllocation(this->device);
    auto counterAllocation = inOrderExecInfo ? inOrderExecInfo->getDeviceCounterAllocation() : nullptr;

    for (auto &subDevice : this->device->getNEODevice()->getRootDevice()->getSubDevices()) {
        for (auto const &engine : subDevice->getAllEngines()) {
            const auto osContextId = engine.commandStreamReceiver->getOsContext().getContextId();

            bool isUsed = (poolAllocation && poolAllocation->isUsedByOsContext(osContextId));

            if (inOrderExecInfo) {
                if (counterAllocation) {
                    isUsed |= counterAllocation->isUsedByOsContext(osContextId);
                } else {
                    DEBUG_BREAK_IF(true); // external allocation - not able to download
                }
            }

            if (isUsed) {
                // Non-blocking: only CSRs that touched the allocation are asked to download.
                engine.commandStreamReceiver->downloadAllocations(false);
            }
        }
    }
}
template <typename TagSizeT>
void EventImp<TagSizeT>::handleSuccessfulHostSynchronization() {
if (this->tbxMode) {
for (auto &csr : csrs) {
csr->downloadAllocations();
}
downloadAllTbxAllocations();
}
this->setIsCompleted();
unsetCmdQueue();

View File

@@ -25,7 +25,7 @@ Fence *Fence::create(CommandQueueImp *cmdQueue, const ze_fence_desc_t *desc) {
ze_result_t Fence::queryStatus() {
auto csr = cmdQueue->getCsr();
csr->downloadAllocations();
csr->downloadAllocations(true);
auto *hostAddr = csr->getTagAddress();

View File

@@ -22,6 +22,7 @@ struct WhiteBox<::L0::Event> : public ::L0::Event {
using BaseClass::counterBasedMode;
using BaseClass::csrs;
using BaseClass::Event;
using BaseClass::eventPoolAllocation;
using BaseClass::gpuHangCheckPeriod;
using BaseClass::hostAddress;
using BaseClass::isFromIpcPool;

View File

@@ -4332,14 +4332,82 @@ HWTEST2_F(EventMultiTileDynamicPacketUseTest, givenEventUsedCreatedOnSubDeviceBu
auto eventAllocation = event->getPoolAllocation(device);
ultCsr0->makeResident(*eventAllocation);
auto hostAddress = static_cast<uint64_t *>(event->getCompletionFieldHostAddress());
*hostAddress = Event::STATE_SIGNALED;
event->hostSynchronize(1);
EXPECT_EQ(1u, ultCsr0->downloadAllocationsCalledCount);
EXPECT_FALSE(ultCsr0->latestDownloadAllocationsBlocking);
EXPECT_EQ(1u, downloadCounter0);
EXPECT_EQ(1u, ultCsr1->downloadAllocationsCalledCount);
EXPECT_TRUE(ultCsr1->latestDownloadAllocationsBlocking);
EXPECT_EQ(0u, downloadCounter1);
event->destroy();
}
// Verifies which CSRs get downloadAllocations() called (and with which blocking flag) when
// in-order (counter based) events created on sub-device 1 are host-synchronized, while the
// counter allocation was made resident on sub-device 0's CSR.
HWTEST2_F(EventMultiTileDynamicPacketUseTest, givenEventCounterBasedUsedCreatedOnSubDeviceButUsedOnDifferentSubdeviceWhenQueryingThenDownload, IsAtLeastXeHpCore) {
neoDevice->getExecutionEnvironment()->calculateMaxOsContextCount();
neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->memoryOperationsInterface = std::make_unique<NEO::MockMemoryOperations>();
auto rootDevice = static_cast<MockDeviceImp *>(device);
// The scenario needs at least two sub-devices.
ASSERT_TRUE(rootDevice->subDevices.size() > 1);
auto subDevice0 = rootDevice->subDevices[0];
auto subDevice1 = rootDevice->subDevices[1];
auto ultCsr0 = static_cast<UltCommandStreamReceiver<FamilyType> *>(subDevice0->getNEODevice()->getDefaultEngine().commandStreamReceiver);
auto ultCsr1 = static_cast<UltCommandStreamReceiver<FamilyType> *>(subDevice1->getNEODevice()->getDefaultEngine().commandStreamReceiver);
// Mark both default-engine CSRs as TBX receivers.
ultCsr0->commandStreamReceiverType = CommandStreamReceiverType::tbx;
ultCsr1->commandStreamReceiverType = CommandStreamReceiverType::tbx;
ze_event_pool_desc_t eventPoolDesc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC};
eventPoolDesc.count = 2;
ze_event_desc_t eventDesc = {ZE_STRUCTURE_TYPE_EVENT_DESC};
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
// Both events are created on sub-device 1.
auto event0 = whiteboxCast(getHelper<L0GfxCoreHelper>().createEvent(eventPool.get(), &eventDesc, subDevice1));
auto event1 = whiteboxCast(getHelper<L0GfxCoreHelper>().createEvent(eventPool.get(), &eventDesc, subDevice1));
// NOTE(review): presumably clears the pool allocation so that only the in-order counter
// allocation can mark a CSR as "used" - confirm against getPoolAllocation().
event0->eventPoolAllocation = nullptr;
event1->eventPoolAllocation = nullptr;
// event0: in-order state created from the device counter allocator tag.
auto inOrderExecInfo0 = NEO::InOrderExecInfo::create(device->getDeviceInOrderCounterAllocator()->getTag(), nullptr, *device->getNEODevice(), 1, false);
inOrderExecInfo0->setLastWaitedCounterValue(1);
event0->updateInOrderExecState(inOrderExecInfo0, 1, 0);
// event1: in-order state created from an external allocation.
auto inOrderExecInfo1 = NEO::InOrderExecInfo::createFromExternalAllocation(*device->getNEODevice(), 0x1, nullptr, nullptr, 1);
inOrderExecInfo1->setLastWaitedCounterValue(1);
event1->updateInOrderExecState(inOrderExecInfo1, 1, 0);
// Only CSR 0 makes event0's device counter allocation resident.
ultCsr0->makeResident(*inOrderExecInfo0->getDeviceCounterAllocation());
event0->hostSynchronize(1);
// event0: CSR 0 is asked once with a non-blocking download, CSR 1 once with a blocking one.
EXPECT_EQ(1u, ultCsr0->downloadAllocationsCalledCount);
EXPECT_FALSE(ultCsr0->latestDownloadAllocationsBlocking);
EXPECT_EQ(1u, ultCsr1->downloadAllocationsCalledCount);
EXPECT_TRUE(ultCsr1->latestDownloadAllocationsBlocking);
event1->hostSynchronize(1);
// event1: CSR 0 receives no additional call; CSR 1 receives one more blocking call.
EXPECT_EQ(1u, ultCsr0->downloadAllocationsCalledCount);
EXPECT_FALSE(ultCsr0->latestDownloadAllocationsBlocking);
EXPECT_EQ(2u, ultCsr1->downloadAllocationsCalledCount);
EXPECT_TRUE(ultCsr1->latestDownloadAllocationsBlocking);
event0->destroy();
event1->destroy();
}
// Runs the testSingleDevice() helper (defined outside this view) for platforms matching
// IsAtLeastXeHpCore; all checks live in that helper.
HWTEST2_F(EventMultiTileDynamicPacketUseTest, givenDynamicPacketEstimationWhenGettingMaxPacketFromSingleOneTileDeviceThenMaxFromThisDeviceSelected, IsAtLeastXeHpCore) {
testSingleDevice();
}

View File

@@ -178,10 +178,10 @@ bool CommandQueueHw<Family>::waitForTimestamps(Range<CopyEngineState> copyEngine
}
if (waited) {
getGpgpuCommandStreamReceiver().downloadAllocations();
getGpgpuCommandStreamReceiver().downloadAllocations(true);
for (const auto &copyEngine : copyEnginesToWait) {
auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType);
bcsCsr->downloadAllocations();
bcsCsr->downloadAllocations(true);
}
}
}

View File

@@ -832,12 +832,12 @@ bool Event::areTimestampsCompleted() {
}
}
}
this->cmdQueue->getGpgpuCommandStreamReceiver().downloadAllocations();
this->cmdQueue->getGpgpuCommandStreamReceiver().downloadAllocations(true);
const auto &bcsStates = this->cmdQueue->peekActiveBcsStates();
for (auto currentBcsIndex = 0u; currentBcsIndex < bcsStates.size(); currentBcsIndex++) {
const auto &state = bcsStates[currentBcsIndex];
if (state.isValid()) {
this->cmdQueue->getBcsCommandStreamReceiver(state.engineType)->downloadAllocations();
this->cmdQueue->getBcsCommandStreamReceiver(state.engineType)->downloadAllocations(true);
}
}
return true;

View File

@@ -1035,7 +1035,7 @@ bool CommandStreamReceiver::testTaskCountReady(volatile TagAddressType *pollAddr
pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset);
}
downloadAllocations();
downloadAllocations(true);
return true;
}

View File

@@ -229,7 +229,7 @@ class CommandStreamReceiver {
virtual WaitStatus waitForCompletionWithTimeout(const WaitParams &params, TaskCountType taskCountToWait);
WaitStatus baseWaitFunction(volatile TagAddressType *pollAddress, const WaitParams &params, TaskCountType taskCountToWait);
MOCKABLE_VIRTUAL bool testTaskCountReady(volatile TagAddressType *pollAddress, TaskCountType taskCountToWait);
virtual void downloadAllocations(){};
virtual void downloadAllocations(bool blockingWait){};
virtual void removeDownloadAllocation(GraphicsAllocation *alloc){};
void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; }

View File

@@ -45,7 +45,7 @@ class TbxCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw<GfxFa
WaitStatus waitForTaskCountWithKmdNotifyFallback(TaskCountType taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, QueueThrottle throttle) override;
WaitStatus waitForCompletionWithTimeout(const WaitParams &params, TaskCountType taskCountToWait) override;
void downloadAllocations() override;
void downloadAllocations(bool blockingWait) override;
void downloadAllocationTbx(GraphicsAllocation &gfxAllocation);
void removeDownloadAllocation(GraphicsAllocation *alloc) override;

View File

@@ -576,19 +576,34 @@ void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocationTbx(GraphicsAlloca
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocations() {
void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocations(bool blockingWait) {
TaskCountType taskCountToWait = this->latestFlushedTaskCount;
volatile TagAddressType *pollAddress = this->getTagAddress();
for (uint32_t i = 0; i < this->activePartitions; i++) {
while (*pollAddress < this->latestFlushedTaskCount) {
while (*pollAddress < taskCountToWait) {
if (!blockingWait) {
return;
}
this->downloadAllocation(*this->getTagAllocation());
}
pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset);
}
auto lockCSR = this->obtainUniqueOwnership();
std::vector<GraphicsAllocation *> notReadyAllocations;
for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) {
this->downloadAllocation(*graphicsAllocation);
// Used again while waiting for completion. Another download will be needed.
if (graphicsAllocation->getTaskCount(this->osContext->getContextId()) > taskCountToWait) {
notReadyAllocations.push_back(graphicsAllocation);
}
}
this->allocationsForDownload.clear();
this->allocationsForDownload = std::set<GraphicsAllocation *>(notReadyAllocations.begin(), notReadyAllocations.end());
}
template <typename GfxFamily>

View File

@@ -272,8 +272,9 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
}
void setPreemptionAllocation(GraphicsAllocation *allocation) { this->preemptionAllocation = allocation; }
void downloadAllocations() override {
void downloadAllocations(bool blockingWait) override {
downloadAllocationsCalledCount++;
latestDownloadAllocationsBlocking = blockingWait;
}
void downloadAllocationUlt(GraphicsAllocation &gfxAllocation) {
@@ -558,6 +559,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
std::optional<SubmissionStatus> flushReturnValue{};
CommandStreamReceiverType commandStreamReceiverType = CommandStreamReceiverType::hardware;
std::atomic<uint32_t> downloadAllocationsCalledCount = 0;
std::atomic<bool> latestDownloadAllocationsBlocking = false;
bool renderStateCacheFlushed = false;
bool renderStateCacheDcFlushForced = false;

View File

@@ -177,7 +177,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
return commandStreamReceiverType;
}
void downloadAllocations() override {
void downloadAllocations(bool blockingWait) override {
downloadAllocationsCalledCount++;
}

View File

@@ -350,7 +350,7 @@ TEST_F(CommandStreamReceiverTest, givenBaseDownloadAllocationCalledThenDoesNotCh
ASSERT_NE(nullptr, graphicsAllocation);
auto numEvictionAllocsBefore = commandStreamReceiver->getEvictionAllocations().size();
commandStreamReceiver->CommandStreamReceiver::downloadAllocations();
commandStreamReceiver->CommandStreamReceiver::downloadAllocations(true);
auto numEvictionAllocsAfter = commandStreamReceiver->getEvictionAllocations().size();
EXPECT_EQ(numEvictionAllocsBefore, numEvictionAllocsAfter);
EXPECT_EQ(0u, numEvictionAllocsAfter);

View File

@@ -473,14 +473,48 @@ HWTEST_F(TbxCommandSteamSimpleTest, givenTbxCsrWhenDownloadAllocatoinsCalledThen
MockGraphicsAllocation allocation1, allocation2, allocation3;
tbxCsr.allocationsForDownload = {&allocation1, &allocation2, &allocation3};
allocation1.updateTaskCount(0, tbxCsr.getOsContext().getContextId());
allocation2.updateTaskCount(0, tbxCsr.getOsContext().getContextId());
allocation3.updateTaskCount(0, tbxCsr.getOsContext().getContextId());
EXPECT_EQ(0u, tbxCsr.obtainUniqueOwnershipCalled);
tbxCsr.downloadAllocations();
tbxCsr.downloadAllocations(true);
EXPECT_EQ(1u, tbxCsr.obtainUniqueOwnershipCalled);
std::set<GraphicsAllocation *> expectedDownloadedAllocations = {tbxCsr.getTagAllocation(), &allocation1, &allocation2, &allocation3};
EXPECT_EQ(0u, tbxCsr.allocationsForDownload.size());
}
// Verifies that a non-blocking downloadAllocations() call does nothing while the tag has not
// reached latestFlushedTaskCount, and that once it has, an allocation whose task count was
// bumped to a higher value remains in allocationsForDownload.
HWTEST_F(TbxCommandSteamSimpleTest, givenTbxCsrWhenUpdatingTaskCountDuringWaitThenDontRemoveFromContainer) {
MockTbxCsrRegisterDownloadedAllocations<FamilyType> tbxCsr{*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()};
MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor(pDevice->getDeviceBitfield()));
tbxCsr.setupContext(osContext);
tbxCsr.initializeTagAllocation();
// Tag value (0) is below the latest flushed task count (1): work is not yet complete.
*tbxCsr.getTagAddress() = 0u;
tbxCsr.latestFlushedTaskCount = 1;
MockGraphicsAllocation allocation1, allocation2, allocation3;
tbxCsr.allocationsForDownload = {&allocation1, &allocation2, &allocation3};
tbxCsr.makeResident(allocation1);
tbxCsr.makeResident(allocation2);
tbxCsr.makeResident(allocation3);
// allocation2 is marked as used by a later task count (2) than the one being waited on (1).
allocation2.updateTaskCount(2u, tbxCsr.getOsContext().getContextId());
// Non-blocking call with the tag not ready: no ownership taken, container untouched.
tbxCsr.downloadAllocations(false);
EXPECT_EQ(0u, tbxCsr.obtainUniqueOwnershipCalled);
EXPECT_EQ(3u, tbxCsr.allocationsForDownload.size());
// Tag now reaches the flushed task count, so the download proceeds.
*tbxCsr.getTagAddress() = 1u;
tbxCsr.downloadAllocations(false);
EXPECT_EQ(1u, tbxCsr.obtainUniqueOwnershipCalled);
// Only allocation2 is expected to stay queued for another download.
EXPECT_EQ(1u, tbxCsr.allocationsForDownload.size());
EXPECT_NE(tbxCsr.allocationsForDownload.find(&allocation2), tbxCsr.allocationsForDownload.end());
}
HWTEST_F(TbxCommandSteamSimpleTest, whenTbxCommandStreamReceiverIsCreatedThenPPGTTAndGGTTCreatedHavePhysicalAddressAllocatorSet) {
MockTbxCsr<FamilyType> tbxCsr(*pDevice->executionEnvironment, pDevice->getDeviceBitfield());

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2023 Intel Corporation
* Copyright (C) 2021-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -11,4 +11,4 @@
#include "shared/test/unit_test/gen12lp/compute_mode_tests_gen12lp.inl"
#include "shared/test/unit_test/gen12lp/tbx_command_stream_receiver_tests_gen12lp.inl"
#include "shared/test/unit_test/gen12lp/test_device_caps_gen12lp.inl"
#include "shared/test/unit_test/gen12lp/test_sample_gen12lp.inl"
#include "shared/test/unit_test/gen12lp/test_sample_gen12lp.inl"