diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index c656ff8fb0..b31ef54d6b 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -1015,9 +1015,9 @@ ze_result_t CommandListCoreFamilyImmediate::hostSynchronize(uint6 } if (this->isTbxMode && (status == ZE_RESULT_SUCCESS)) { - mainQueueCsr->downloadAllocations(); + mainQueueCsr->downloadAllocations(true); if (isCopyOffloadEnabled()) { - copyOffloadCsr->downloadAllocations(); + copyOffloadCsr->downloadAllocations(true); } } diff --git a/level_zero/core/source/event/event_imp.h b/level_zero/core/source/event/event_imp.h index d1ce31fb92..e25d2244e6 100644 --- a/level_zero/core/source/event/event_imp.h +++ b/level_zero/core/source/event/event_imp.h @@ -59,6 +59,7 @@ struct EventImp : public Event { protected: ze_result_t waitForUserFence(uint64_t timeout); + void downloadAllTbxAllocations(); bool handlePreQueryStatusOperationsAndCheckCompletion(); bool tbxDownload(NEO::CommandStreamReceiver &csr, bool &downloadedAllocation, bool &downloadedInOrdedAllocation); diff --git a/level_zero/core/source/event/event_impl.inl b/level_zero/core/source/event/event_impl.inl index 7d801db6f8..f9ed57ad1d 100644 --- a/level_zero/core/source/event/event_impl.inl +++ b/level_zero/core/source/event/event_impl.inl @@ -250,12 +250,38 @@ ze_result_t EventImp::queryCounterBasedEventStatus() { return ZE_RESULT_SUCCESS; } +template +void EventImp::downloadAllTbxAllocations() { + for (auto &csr : csrs) { + csr->downloadAllocations(true); + } + + for (auto &subDevice : this->device->getNEODevice()->getRootDevice()->getSubDevices()) { + for (auto const &engine : subDevice->getAllEngines()) { + auto osContextId = engine.commandStreamReceiver->getOsContext().getContextId(); + + auto poolAllocation = getPoolAllocation(this->device); + bool isUsed = (poolAllocation && 
poolAllocation->isUsedByOsContext(osContextId)); + + if (inOrderExecInfo) { + if (inOrderExecInfo->getDeviceCounterAllocation()) { + isUsed |= inOrderExecInfo->getDeviceCounterAllocation()->isUsedByOsContext(osContextId); + } else { + DEBUG_BREAK_IF(true); // external allocation - not able to download + } + } + + if (isUsed) { + engine.commandStreamReceiver->downloadAllocations(false); + } + } + } +} + template void EventImp::handleSuccessfulHostSynchronization() { if (this->tbxMode) { - for (auto &csr : csrs) { - csr->downloadAllocations(); - } + downloadAllTbxAllocations(); } this->setIsCompleted(); unsetCmdQueue(); diff --git a/level_zero/core/source/fence/fence.cpp b/level_zero/core/source/fence/fence.cpp index f944910094..86018244ad 100644 --- a/level_zero/core/source/fence/fence.cpp +++ b/level_zero/core/source/fence/fence.cpp @@ -25,7 +25,7 @@ Fence *Fence::create(CommandQueueImp *cmdQueue, const ze_fence_desc_t *desc) { ze_result_t Fence::queryStatus() { auto csr = cmdQueue->getCsr(); - csr->downloadAllocations(); + csr->downloadAllocations(true); auto *hostAddr = csr->getTagAddress(); diff --git a/level_zero/core/test/unit_tests/mocks/mock_event.h b/level_zero/core/test/unit_tests/mocks/mock_event.h index fca0863e4e..390c45fe20 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_event.h +++ b/level_zero/core/test/unit_tests/mocks/mock_event.h @@ -22,6 +22,7 @@ struct WhiteBox<::L0::Event> : public ::L0::Event { using BaseClass::counterBasedMode; using BaseClass::csrs; using BaseClass::Event; + using BaseClass::eventPoolAllocation; using BaseClass::gpuHangCheckPeriod; using BaseClass::hostAddress; using BaseClass::isFromIpcPool; diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index d5758efea9..44aace4098 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -4332,14 +4332,82 @@ 
HWTEST2_F(EventMultiTileDynamicPacketUseTest, givenEventUsedCreatedOnSubDeviceBu auto eventAllocation = event->getPoolAllocation(device); ultCsr0->makeResident(*eventAllocation); + auto hostAddress = static_cast(event->getCompletionFieldHostAddress()); + *hostAddress = Event::STATE_SIGNALED; + event->hostSynchronize(1); + EXPECT_EQ(1u, ultCsr0->downloadAllocationsCalledCount); + EXPECT_FALSE(ultCsr0->latestDownloadAllocationsBlocking); EXPECT_EQ(1u, downloadCounter0); + + EXPECT_EQ(1u, ultCsr1->downloadAllocationsCalledCount); + EXPECT_TRUE(ultCsr1->latestDownloadAllocationsBlocking); EXPECT_EQ(0u, downloadCounter1); event->destroy(); } +HWTEST2_F(EventMultiTileDynamicPacketUseTest, givenEventCounterBasedUsedCreatedOnSubDeviceButUsedOnDifferentSubdeviceWhenQueryingThenDownload, IsAtLeastXeHpCore) { + neoDevice->getExecutionEnvironment()->calculateMaxOsContextCount(); + + neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->memoryOperationsInterface = std::make_unique(); + + auto rootDevice = static_cast(device); + + ASSERT_TRUE(rootDevice->subDevices.size() > 1); + + auto subDevice0 = rootDevice->subDevices[0]; + auto subDevice1 = rootDevice->subDevices[1]; + + auto ultCsr0 = static_cast *>(subDevice0->getNEODevice()->getDefaultEngine().commandStreamReceiver); + auto ultCsr1 = static_cast *>(subDevice1->getNEODevice()->getDefaultEngine().commandStreamReceiver); + + ultCsr0->commandStreamReceiverType = CommandStreamReceiverType::tbx; + ultCsr1->commandStreamReceiverType = CommandStreamReceiverType::tbx; + + ze_event_pool_desc_t eventPoolDesc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC}; + eventPoolDesc.count = 2; + ze_event_desc_t eventDesc = {ZE_STRUCTURE_TYPE_EVENT_DESC}; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + + auto event0 = whiteboxCast(getHelper().createEvent(eventPool.get(), &eventDesc, subDevice1)); + auto event1 = 
whiteboxCast(getHelper().createEvent(eventPool.get(), &eventDesc, subDevice1)); + event0->eventPoolAllocation = nullptr; + event1->eventPoolAllocation = nullptr; + + auto inOrderExecInfo0 = NEO::InOrderExecInfo::create(device->getDeviceInOrderCounterAllocator()->getTag(), nullptr, *device->getNEODevice(), 1, false); + inOrderExecInfo0->setLastWaitedCounterValue(1); + event0->updateInOrderExecState(inOrderExecInfo0, 1, 0); + + auto inOrderExecInfo1 = NEO::InOrderExecInfo::createFromExternalAllocation(*device->getNEODevice(), 0x1, nullptr, nullptr, 1); + inOrderExecInfo1->setLastWaitedCounterValue(1); + event1->updateInOrderExecState(inOrderExecInfo1, 1, 0); + + ultCsr0->makeResident(*inOrderExecInfo0->getDeviceCounterAllocation()); + + event0->hostSynchronize(1); + + EXPECT_EQ(1u, ultCsr0->downloadAllocationsCalledCount); + EXPECT_FALSE(ultCsr0->latestDownloadAllocationsBlocking); + + EXPECT_EQ(1u, ultCsr1->downloadAllocationsCalledCount); + EXPECT_TRUE(ultCsr1->latestDownloadAllocationsBlocking); + + event1->hostSynchronize(1); + + EXPECT_EQ(1u, ultCsr0->downloadAllocationsCalledCount); + EXPECT_FALSE(ultCsr0->latestDownloadAllocationsBlocking); + + EXPECT_EQ(2u, ultCsr1->downloadAllocationsCalledCount); + EXPECT_TRUE(ultCsr1->latestDownloadAllocationsBlocking); + + event0->destroy(); + event1->destroy(); +} + HWTEST2_F(EventMultiTileDynamicPacketUseTest, givenDynamicPacketEstimationWhenGettingMaxPacketFromSingleOneTileDeviceThenMaxFromThisDeviceSelected, IsAtLeastXeHpCore) { testSingleDevice(); } diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index 4ee4cd0fe0..fa7a6842e3 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -178,10 +178,10 @@ bool CommandQueueHw::waitForTimestamps(Range copyEngine } if (waited) { - getGpgpuCommandStreamReceiver().downloadAllocations(); + 
getGpgpuCommandStreamReceiver().downloadAllocations(true); for (const auto &copyEngine : copyEnginesToWait) { auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType); - bcsCsr->downloadAllocations(); + bcsCsr->downloadAllocations(true); } } } diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index bb32a9c62b..eb338d5de1 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -832,12 +832,12 @@ bool Event::areTimestampsCompleted() { } } } - this->cmdQueue->getGpgpuCommandStreamReceiver().downloadAllocations(); + this->cmdQueue->getGpgpuCommandStreamReceiver().downloadAllocations(true); const auto &bcsStates = this->cmdQueue->peekActiveBcsStates(); for (auto currentBcsIndex = 0u; currentBcsIndex < bcsStates.size(); currentBcsIndex++) { const auto &state = bcsStates[currentBcsIndex]; if (state.isValid()) { - this->cmdQueue->getBcsCommandStreamReceiver(state.engineType)->downloadAllocations(); + this->cmdQueue->getBcsCommandStreamReceiver(state.engineType)->downloadAllocations(true); } } return true; diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 0986cb01d3..bb2f632476 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -1035,7 +1035,7 @@ bool CommandStreamReceiver::testTaskCountReady(volatile TagAddressType *pollAddr pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset); } - downloadAllocations(); + downloadAllocations(true); return true; } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 0cddacdfc2..eb5aaf6dce 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -229,7 +229,7 @@ class CommandStreamReceiver { virtual WaitStatus waitForCompletionWithTimeout(const
WaitParams &params, TaskCountType taskCountToWait); WaitStatus baseWaitFunction(volatile TagAddressType *pollAddress, const WaitParams &params, TaskCountType taskCountToWait); MOCKABLE_VIRTUAL bool testTaskCountReady(volatile TagAddressType *pollAddress, TaskCountType taskCountToWait); - virtual void downloadAllocations(){}; + virtual void downloadAllocations(bool blockingWait){}; virtual void removeDownloadAllocation(GraphicsAllocation *alloc){}; void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; } diff --git a/shared/source/command_stream/tbx_command_stream_receiver_hw.h b/shared/source/command_stream/tbx_command_stream_receiver_hw.h index 56ac1643c6..25d84dc544 100644 --- a/shared/source/command_stream/tbx_command_stream_receiver_hw.h +++ b/shared/source/command_stream/tbx_command_stream_receiver_hw.h @@ -45,7 +45,7 @@ class TbxCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw::downloadAllocationTbx(GraphicsAlloca } template -void TbxCommandStreamReceiverHw::downloadAllocations() { +void TbxCommandStreamReceiverHw::downloadAllocations(bool blockingWait) { + TaskCountType taskCountToWait = this->latestFlushedTaskCount; + volatile TagAddressType *pollAddress = this->getTagAddress(); + for (uint32_t i = 0; i < this->activePartitions; i++) { - while (*pollAddress < this->latestFlushedTaskCount) { + while (*pollAddress < taskCountToWait) { + if (!blockingWait) { + return; + } this->downloadAllocation(*this->getTagAllocation()); } pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset); } auto lockCSR = this->obtainUniqueOwnership(); + + std::vector notReadyAllocations; + for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) { this->downloadAllocation(*graphicsAllocation); + + // Used again while waiting for completion. Another download will be needed.
+ if (graphicsAllocation->getTaskCount(this->osContext->getContextId()) > taskCountToWait) { + notReadyAllocations.push_back(graphicsAllocation); + } } this->allocationsForDownload.clear(); + this->allocationsForDownload = std::set(notReadyAllocations.begin(), notReadyAllocations.end()); } template diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index c44bfaeac2..e5d156dcb9 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -272,8 +272,9 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ } void setPreemptionAllocation(GraphicsAllocation *allocation) { this->preemptionAllocation = allocation; } - void downloadAllocations() override { + void downloadAllocations(bool blockingWait) override { downloadAllocationsCalledCount++; + latestDownloadAllocationsBlocking = blockingWait; } void downloadAllocationUlt(GraphicsAllocation &gfxAllocation) { @@ -558,6 +559,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ std::optional flushReturnValue{}; CommandStreamReceiverType commandStreamReceiverType = CommandStreamReceiverType::hardware; std::atomic downloadAllocationsCalledCount = 0; + std::atomic latestDownloadAllocationsBlocking = false; bool renderStateCacheFlushed = false; bool renderStateCacheDcFlushForced = false; diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index 1528d89b29..c147046460 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -177,7 +177,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { return commandStreamReceiverType; } - void downloadAllocations() override { + void downloadAllocations(bool blockingWait) override { downloadAllocationsCalledCount++; } diff --git 
a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 79fb7481ef..324470fe90 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -350,7 +350,7 @@ TEST_F(CommandStreamReceiverTest, givenBaseDownloadAllocationCalledThenDoesNotCh ASSERT_NE(nullptr, graphicsAllocation); auto numEvictionAllocsBefore = commandStreamReceiver->getEvictionAllocations().size(); - commandStreamReceiver->CommandStreamReceiver::downloadAllocations(); + commandStreamReceiver->CommandStreamReceiver::downloadAllocations(true); auto numEvictionAllocsAfter = commandStreamReceiver->getEvictionAllocations().size(); EXPECT_EQ(numEvictionAllocsBefore, numEvictionAllocsAfter); EXPECT_EQ(0u, numEvictionAllocsAfter); diff --git a/shared/test/unit_test/command_stream/tbx_command_stream_tests.cpp b/shared/test/unit_test/command_stream/tbx_command_stream_tests.cpp index 03969f63a0..75f768a588 100644 --- a/shared/test/unit_test/command_stream/tbx_command_stream_tests.cpp +++ b/shared/test/unit_test/command_stream/tbx_command_stream_tests.cpp @@ -473,14 +473,48 @@ HWTEST_F(TbxCommandSteamSimpleTest, givenTbxCsrWhenDownloadAllocatoinsCalledThen MockGraphicsAllocation allocation1, allocation2, allocation3; tbxCsr.allocationsForDownload = {&allocation1, &allocation2, &allocation3}; + allocation1.updateTaskCount(0, tbxCsr.getOsContext().getContextId()); + allocation2.updateTaskCount(0, tbxCsr.getOsContext().getContextId()); + allocation3.updateTaskCount(0, tbxCsr.getOsContext().getContextId()); + EXPECT_EQ(0u, tbxCsr.obtainUniqueOwnershipCalled); - tbxCsr.downloadAllocations(); + tbxCsr.downloadAllocations(true); EXPECT_EQ(1u, tbxCsr.obtainUniqueOwnershipCalled); std::set expectedDownloadedAllocations = {tbxCsr.getTagAllocation(), &allocation1, &allocation2, &allocation3}; EXPECT_EQ(0u, 
tbxCsr.allocationsForDownload.size()); } +HWTEST_F(TbxCommandSteamSimpleTest, givenTbxCsrWhenUpdatingTaskCountDuringWaitThenDontRemoveFromContainer) { + MockTbxCsrRegisterDownloadedAllocations tbxCsr{*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()}; + MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor(pDevice->getDeviceBitfield())); + + tbxCsr.setupContext(osContext); + tbxCsr.initializeTagAllocation(); + *tbxCsr.getTagAddress() = 0u; + tbxCsr.latestFlushedTaskCount = 1; + + MockGraphicsAllocation allocation1, allocation2, allocation3; + tbxCsr.allocationsForDownload = {&allocation1, &allocation2, &allocation3}; + + tbxCsr.makeResident(allocation1); + tbxCsr.makeResident(allocation2); + tbxCsr.makeResident(allocation3); + + allocation2.updateTaskCount(2u, tbxCsr.getOsContext().getContextId()); + + tbxCsr.downloadAllocations(false); + EXPECT_EQ(0u, tbxCsr.obtainUniqueOwnershipCalled); + EXPECT_EQ(3u, tbxCsr.allocationsForDownload.size()); + + *tbxCsr.getTagAddress() = 1u; + + tbxCsr.downloadAllocations(false); + EXPECT_EQ(1u, tbxCsr.obtainUniqueOwnershipCalled); + EXPECT_EQ(1u, tbxCsr.allocationsForDownload.size()); + EXPECT_NE(tbxCsr.allocationsForDownload.find(&allocation2), tbxCsr.allocationsForDownload.end()); +} + HWTEST_F(TbxCommandSteamSimpleTest, whenTbxCommandStreamReceiverIsCreatedThenPPGTTAndGGTTCreatedHavePhysicalAddressAllocatorSet) { MockTbxCsr tbxCsr(*pDevice->executionEnvironment, pDevice->getDeviceBitfield()); diff --git a/shared/test/unit_test/gen12lp/gen12lp_shared_tests_wrapper.cpp b/shared/test/unit_test/gen12lp/gen12lp_shared_tests_wrapper.cpp index aeeaeddebd..88d317f31d 100644 --- a/shared/test/unit_test/gen12lp/gen12lp_shared_tests_wrapper.cpp +++ b/shared/test/unit_test/gen12lp/gen12lp_shared_tests_wrapper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -11,4 +11,4 @@ 
#include "shared/test/unit_test/gen12lp/compute_mode_tests_gen12lp.inl" #include "shared/test/unit_test/gen12lp/tbx_command_stream_receiver_tests_gen12lp.inl" #include "shared/test/unit_test/gen12lp/test_device_caps_gen12lp.inl" -#include "shared/test/unit_test/gen12lp/test_sample_gen12lp.inl" \ No newline at end of file +#include "shared/test/unit_test/gen12lp/test_sample_gen12lp.inl"