From 54eee2a88bf5c6c13b784f4ccaa3233c8e427589 Mon Sep 17 00:00:00 2001 From: Maciej Plewka Date: Thu, 21 Apr 2022 08:51:02 +0000 Subject: [PATCH] Fix multi thread usage of external host alloc With this commit OpenCL will track if external host memory is used from few threads and will secure to update task count in all threads before destroing allocation. Resolves: NEO-6807 Signed-off-by: Maciej Plewka --- .../internal_allocation_storage_tests.cpp | 22 ++++++ .../command_stream_receiver.cpp | 3 +- .../command_stream_receiver_hw_base.inl | 3 +- .../memory_manager/graphics_allocation.cpp | 14 +++- .../memory_manager/graphics_allocation.h | 8 ++- .../internal_allocation_storage.cpp | 2 +- shared/source/memory_manager/surface.h | 4 +- .../libult/ult_command_stream_receiver.h | 5 +- .../mocks/mock_command_stream_receiver.h | 8 ++- .../common/mocks/mock_graphics_allocation.h | 14 ++++ .../command_stream_receiver_tests.cpp | 44 ++++++++++++ .../graphics_allocation_tests.cpp | 68 ++++++++++++++++++- 12 files changed, 185 insertions(+), 10 deletions(-) diff --git a/opencl/test/unit_test/memory_manager/internal_allocation_storage_tests.cpp b/opencl/test/unit_test/memory_manager/internal_allocation_storage_tests.cpp index 28b8254959..8ccbbdb484 100644 --- a/opencl/test/unit_test/memory_manager/internal_allocation_storage_tests.cpp +++ b/opencl/test/unit_test/memory_manager/internal_allocation_storage_tests.cpp @@ -297,3 +297,25 @@ HWTEST_F(InternalAllocationStorageTest, givenMultipleActivePartitionsWhenDetachi memoryManager->freeGraphicsMemory(allocationReusable.release()); } +TEST_F(InternalAllocationStorageTest, givenInternalAllocationWhenTaskCountMetsExpectationAndItHasBeenAssignedThenAllocIsRemoved) { + auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); + uint32_t expectedTaskCount = 10u; + *csr->getTagAddress() = expectedTaskCount; + allocation->updateTaskCount(expectedTaskCount, csr->getOsContext().getContextId()); + allocation->hostPtrTaskCountAssignment = 0; + storage->storeAllocation(std::unique_ptr(allocation), TEMPORARY_ALLOCATION); + storage->cleanAllocationList(expectedTaskCount, TEMPORARY_ALLOCATION); + EXPECT_TRUE(csr->getTemporaryAllocations().peekIsEmpty()); +} + +TEST_F(InternalAllocationStorageTest, givenInternalAllocationWhenTaskCountMetsExpectationAndItHasNotBeenAssignedThenAllocIsNotRemoved) { + auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); + uint32_t expectedTaskCount = 10u; + *csr->getTagAddress() = expectedTaskCount; + allocation->updateTaskCount(expectedTaskCount, csr->getOsContext().getContextId()); + allocation->hostPtrTaskCountAssignment = 1; + storage->storeAllocation(std::unique_ptr(allocation), TEMPORARY_ALLOCATION); + storage->cleanAllocationList(expectedTaskCount, TEMPORARY_ALLOCATION); + EXPECT_FALSE(csr->getTemporaryAllocations().peekIsEmpty()); + allocation->hostPtrTaskCountAssignment = 0; +} \ No newline at end of file diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index fc13c9d607..9dd5e38a35 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -734,7 +734,8 @@ bool CommandStreamReceiver::createAllocationForHostSurface(HostPtrSurface &surfa if (allocation == nullptr) { return false; } - allocation->updateTaskCount(CompletionStamp::notReady, osContext->getContextId()); + allocation->hostPtrTaskCountAssignment++; + allocation->updateTaskCount(0u, osContext->getContextId()); surface.setAllocation(allocation.get()); internalAllocationStorage->storeAllocation(std::move(allocation), TEMPORARY_ALLOCATION); return true; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 873a385b69..2673d8f8af 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -1072,7 +1072,8 @@ uint32_t CommandStreamReceiverHw::flushBcsTask(const BlitPropertiesCo } blitProperties.csrDependencies.makeResident(*this); - + blitProperties.srcAllocation->prepareHostPtrForResidency(this); + blitProperties.dstAllocation->prepareHostPtrForResidency(this); makeResident(*blitProperties.srcAllocation); makeResident(*blitProperties.dstAllocation); if (blitProperties.clearColorAllocation) { diff --git a/shared/source/memory_manager/graphics_allocation.cpp b/shared/source/memory_manager/graphics_allocation.cpp index 62821d6d54..825a7da816 100644 --- a/shared/source/memory_manager/graphics_allocation.cpp +++ b/shared/source/memory_manager/graphics_allocation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,6 +7,7 @@ #include "graphics_allocation.h" +#include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/gmm_helper/gmm.h" #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/helpers/aligned_memory.h" @@ -102,6 +103,17 @@ bool GraphicsAllocation::isTbxWritable(uint32_t banks) const { return isAnyBitSet(aubInfo.tbxWritable, banks); } +void GraphicsAllocation::prepareHostPtrForResidency(CommandStreamReceiver *csr) { + if (hostPtrTaskCountAssignment > 0) { + auto allocTaskCount = getTaskCount(csr->getOsContext().getContextId()); + auto currentTaskCount = *csr->getTagAddress() + 1; + if (currentTaskCount > allocTaskCount) { + updateTaskCount(currentTaskCount, csr->getOsContext().getContextId()); + hostPtrTaskCountAssignment--; + } + } +} + constexpr uint32_t GraphicsAllocation::objectNotUsed; constexpr uint32_t GraphicsAllocation::objectNotResident; constexpr uint32_t GraphicsAllocation::objectAlwaysResident; diff --git a/shared/source/memory_manager/graphics_allocation.h b/shared/source/memory_manager/graphics_allocation.h index 02fe659f5b..394f1499b1 100644 --- a/shared/source/memory_manager/graphics_allocation.h +++ b/shared/source/memory_manager/graphics_allocation.h @@ -43,6 +43,7 @@ constexpr auto nonSharedResource = 0u; class Gmm; class MemoryManager; +class CommandStreamReceiver; struct AubInfo { uint32_t aubWritable = std::numeric_limits::max(); @@ -155,8 +156,8 @@ class GraphicsAllocation : public IDNode { bool isUsed() const { return registeredContextsNum > 0; } bool isUsedByManyOsContexts() const { return registeredContextsNum > 1u; } bool isUsedByOsContext(uint32_t contextId) const { return objectNotUsed != getTaskCount(contextId); } - void updateTaskCount(uint32_t newTaskCount, uint32_t contextId); - uint32_t getTaskCount(uint32_t contextId) const { return usageInfos[contextId].taskCount; } + MOCKABLE_VIRTUAL void updateTaskCount(uint32_t newTaskCount, uint32_t contextId); + MOCKABLE_VIRTUAL uint32_t getTaskCount(uint32_t contextId) const { return usageInfos[contextId].taskCount; } void releaseUsageInOsContext(uint32_t contextId) { updateTaskCount(objectNotUsed, contextId); } uint32_t getInspectionId(uint32_t contextId) const { return usageInfos[contextId].inspectionId; } void setInspectionId(uint32_t newInspectionId, uint32_t contextId) { usageInfos[contextId].inspectionId = newInspectionId; } @@ -215,6 +216,8 @@ class GraphicsAllocation : public IDNode { this->reservedAddressRangeInfo.rangeSize = size; } + void prepareHostPtrForResidency(CommandStreamReceiver *csr); + Gmm *getDefaultGmm() const { return getGmm(0u); } @@ -252,6 +255,7 @@ class GraphicsAllocation : public IDNode { constexpr static uint32_t objectNotResident = std::numeric_limits::max(); constexpr static uint32_t objectNotUsed = std::numeric_limits::max(); constexpr static uint32_t objectAlwaysResident = std::numeric_limits::max() - 1; + std::atomic hostPtrTaskCountAssignment{0}; protected: struct UsageInfo { diff --git a/shared/source/memory_manager/internal_allocation_storage.cpp b/shared/source/memory_manager/internal_allocation_storage.cpp index 66503e0565..26434fca75 100644 --- a/shared/source/memory_manager/internal_allocation_storage.cpp +++ b/shared/source/memory_manager/internal_allocation_storage.cpp @@ -52,7 +52,7 @@ void InternalAllocationStorage::freeAllocationsList(uint32_t waitTaskCount, Allo IDList allocationsLeft; while (curr != nullptr) { auto *next = curr->next; - if (curr->getTaskCount(commandStreamReceiver.getOsContext().getContextId()) <= waitTaskCount) { + if (curr->getTaskCount(commandStreamReceiver.getOsContext().getContextId()) <= waitTaskCount && curr->hostPtrTaskCountAssignment == 0) { memoryManager->freeGraphicsMemory(curr); } else { allocationsLeft.pushTailOne(*curr); diff --git a/shared/source/memory_manager/surface.h b/shared/source/memory_manager/surface.h index 0d159c1e76..ee2203cdcb 100644 --- a/shared/source/memory_manager/surface.h +++ b/shared/source/memory_manager/surface.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -9,6 +9,7 @@ #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/helpers/cache_policy.h" #include "shared/source/memory_manager/graphics_allocation.h" +#include "shared/source/os_interface/os_context.h" namespace NEO { @@ -49,6 +50,7 @@ class HostPtrSurface : public Surface { void makeResident(CommandStreamReceiver &csr) override { DEBUG_BREAK_IF(!gfxAllocation); + gfxAllocation->prepareHostPtrForResidency(&csr); csr.makeResidentHostPtrAllocation(gfxAllocation); } Surface *duplicate() override { diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index bd58f10a11..bb8dc0498d 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -318,7 +318,10 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ bool createAllocationForHostSurface(HostPtrSurface &surface, bool requiresL3Flush) override { createAllocationForHostSurfaceCalled++; cpuCopyForHostPtrSurfaceAllowed = surface.peekIsPtrCopyAllowed(); - return BaseClass::createAllocationForHostSurface(surface, requiresL3Flush); + auto status = BaseClass::createAllocationForHostSurface(surface, requiresL3Flush); + if (status) + surface.getAllocation()->hostPtrTaskCountAssignment--; + return status; } void ensureCommandBufferAllocation(LinearStream &commandStream, size_t minimumRequiredSize, size_t additionalAllocationSize) override { diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index 14d3d20594..f71ae1ecbc 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -15,6 +15,7 @@ #include "shared/source/helpers/hw_info.h" #include "shared/source/helpers/string.h" #include "shared/source/memory_manager/graphics_allocation.h" +#include "shared/source/memory_manager/surface.h" #include "shared/source/os_interface/os_context.h" #include "shared/test/common/helpers/dispatch_flags_helper.h" @@ -151,7 +152,12 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { ++hostPtrSurfaceCreationMutexLockCount; return CommandStreamReceiver::obtainHostPtrSurfaceCreationLock(); } - + bool createAllocationForHostSurface(HostPtrSurface &surface, bool requiresL3Flush) override { + bool status = CommandStreamReceiver::createAllocationForHostSurface(surface, requiresL3Flush); + if (status) + surface.getAllocation()->hostPtrTaskCountAssignment--; + return status; + } void postInitFlagsSetup() override {} static constexpr size_t tagSize = 256; diff --git a/shared/test/common/mocks/mock_graphics_allocation.h b/shared/test/common/mocks/mock_graphics_allocation.h index 39eaf52e49..57c7abb277 100644 --- a/shared/test/common/mocks/mock_graphics_allocation.h +++ b/shared/test/common/mocks/mock_graphics_allocation.h @@ -51,6 +51,20 @@ class MockGraphicsAllocation : public MemoryAllocation { } }; +class MockGraphicsAllocationTaskCount : public MockGraphicsAllocation { + public: + uint32_t getTaskCount(uint32_t contextId) const override { + getTaskCountCalleedTimes++; + return MockGraphicsAllocation::getTaskCount(contextId); + } + void updateTaskCount(uint32_t newTaskCount, uint32_t contextId) override { + updateTaskCountCalleedTimes++; + MockGraphicsAllocation::updateTaskCount(newTaskCount, contextId); + } + static uint32_t getTaskCountCalleedTimes; + uint32_t updateTaskCountCalleedTimes = 0; +}; + namespace GraphicsAllocationHelper { static inline MultiGraphicsAllocation toMultiGraphicsAllocation(GraphicsAllocation *graphicsAllocation) { diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index a3d4ac3a6c..28dbdd7e9d 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -1502,6 +1502,50 @@ TEST_F(CreateAllocationForHostSurfaceTest, givenTemporaryAllocationWhenCreateAll EXPECT_EQ(allocationPtr, hostSurfaceAllocationPtr); } +class MockCommandStreamReceiverHostPtrCreate : public MockCommandStreamReceiver { + public: + MockCommandStreamReceiverHostPtrCreate(ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield) + : MockCommandStreamReceiver(executionEnvironment, rootDeviceIndex, deviceBitfield) {} + bool createAllocationForHostSurface(HostPtrSurface &surface, bool requiresL3Flush) override { + return CommandStreamReceiver::createAllocationForHostSurface(surface, requiresL3Flush); + } +}; +TEST_F(CreateAllocationForHostSurfaceTest, givenTemporaryAllocationWhenCreateAllocationForHostSurfaceThenHostPtrTaskCountAssignmentWillIncrease) { + auto mockCsr = std::make_unique(executionEnvironment, 0u, device->getDeviceBitfield()); + mockCsr->internalAllocationStorage = std::make_unique(*mockCsr.get()); + mockCsr->osContext = &commandStreamReceiver->getOsContext(); + auto hostPtr = reinterpret_cast(0x1234); + size_t size = 100; + auto temporaryAllocation = std::make_unique(0, + AllocationType::EXTERNAL_HOST_PTR, hostPtr, size, 0, MemoryPool::System4KBPages, MemoryManager::maxOsContextCount); + auto allocationPtr = temporaryAllocation.get(); + temporaryAllocation->updateTaskCount(0u, 0u); + mockCsr->getInternalAllocationStorage()->storeAllocation(std::move(temporaryAllocation), TEMPORARY_ALLOCATION); + *mockCsr->getTagAddress() = 1u; + HostPtrSurface hostSurface(hostPtr, size); + + uint32_t valueBefore = allocationPtr->hostPtrTaskCountAssignment; + mockCsr->createAllocationForHostSurface(hostSurface, false); + EXPECT_EQ(valueBefore + 1, hostSurface.getAllocation()->hostPtrTaskCountAssignment); + allocationPtr->hostPtrTaskCountAssignment--; +} + +TEST_F(CreateAllocationForHostSurfaceTest, givenTemporaryAllocationWhenCreateAllocationForHostSurfaceThenAllocTaskCountEqualZero) { + auto hostPtr = reinterpret_cast(0x1234); + size_t size = 100; + auto temporaryAllocation = std::make_unique(0, + AllocationType::EXTERNAL_HOST_PTR, hostPtr, size, 0, MemoryPool::System4KBPages, MemoryManager::maxOsContextCount); + auto allocationPtr = temporaryAllocation.get(); + temporaryAllocation->updateTaskCount(10u, 0u); + commandStreamReceiver->getInternalAllocationStorage()->storeAllocation(std::move(temporaryAllocation), TEMPORARY_ALLOCATION); + *commandStreamReceiver->getTagAddress() = 1u; + HostPtrSurface hostSurface(hostPtr, size); + + EXPECT_EQ(allocationPtr->getTaskCount(0u), 10u); + commandStreamReceiver->createAllocationForHostSurface(hostSurface, false); + EXPECT_EQ(allocationPtr->getTaskCount(0u), 0u); +} + TEST_F(CreateAllocationForHostSurfaceTest, whenCreatingAllocationFromHostPtrSurfaceThenLockMutex) { const char memory[8] = {1, 2, 3, 4, 5, 6, 7, 8}; size_t size = sizeof(memory); diff --git a/shared/test/unit_test/memory_manager/graphics_allocation_tests.cpp b/shared/test/unit_test/memory_manager/graphics_allocation_tests.cpp index bf6f9f0af6..c542479fb1 100644 --- a/shared/test/unit_test/memory_manager/graphics_allocation_tests.cpp +++ b/shared/test/unit_test/memory_manager/graphics_allocation_tests.cpp @@ -6,6 +6,7 @@ */ #include "shared/test/common/mocks/mock_aub_csr.h" +#include "shared/test/common/mocks/mock_command_stream_receiver.h" #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/test_macros/test.h" @@ -294,7 +295,7 @@ HWTEST_F(GraphicsAllocationTests, givenGraphicsAllocationThatHasPageTablesClonin EXPECT_TRUE(aubCsr.isAubWritable(graphicsAllocation)); - //modify non default bank + // modify non default bank graphicsAllocation.setAubWritable(false, 0x2); EXPECT_TRUE(aubCsr.isAubWritable(graphicsAllocation)); @@ -399,3 +400,68 @@ HWTEST_F(GraphicsAllocationTests, givenMultiStorageGraphicsAllocationWhenTbxWrit EXPECT_TRUE(graphicsAllocation.isTbxWritable(0b101)); EXPECT_TRUE(graphicsAllocation.isTbxWritable(0b1010)); } + +uint32_t MockGraphicsAllocationTaskCount::getTaskCountCalleedTimes = 0; + +TEST(GraphicsAllocationTest, givenGraphicsAllocationWhenAssignedTaskCountEqualZeroThenPrepareForResidencyDoeNotCallGetTaskCount) { + MockGraphicsAllocationTaskCount::getTaskCountCalleedTimes = 0; + MockGraphicsAllocationTaskCount graphicsAllocation; + graphicsAllocation.hostPtrTaskCountAssignment = 0; + graphicsAllocation.prepareHostPtrForResidency(nullptr); + EXPECT_EQ(MockGraphicsAllocationTaskCount::getTaskCountCalleedTimes, 0u); + MockGraphicsAllocationTaskCount::getTaskCountCalleedTimes = 0; +} + +HWTEST_F(GraphicsAllocationTests, givenGraphicsAllocationWhenAssignedTaskCountAbovelZeroThenPrepareForResidencyGetTaskCountWasCalled) { + executionEnvironment.initializeMemoryManager(); + auto osContext = std::unique_ptr(OsContext::create(nullptr, 0, EngineDescriptorHelper::getDefaultDescriptor())); + MockCommandStreamReceiver csr(executionEnvironment, 0, 1); + csr.osContext = osContext.get(); + MockGraphicsAllocationTaskCount::getTaskCountCalleedTimes = 0; + MockGraphicsAllocationTaskCount graphicsAllocation; + graphicsAllocation.hostPtrTaskCountAssignment = 1; + graphicsAllocation.prepareHostPtrForResidency(&csr); + EXPECT_EQ(MockGraphicsAllocationTaskCount::getTaskCountCalleedTimes, 1u); + MockGraphicsAllocationTaskCount::getTaskCountCalleedTimes = 0; +} + +HWTEST_F(GraphicsAllocationTests, givenGraphicsAllocationAllocTaskCountHigherThanInCsrThenUpdateTaskCountWasNotCalled) { + executionEnvironment.initializeMemoryManager(); + auto osContext = std::unique_ptr(OsContext::create(nullptr, 0, EngineDescriptorHelper::getDefaultDescriptor())); + MockCommandStreamReceiver csr(executionEnvironment, 0, 1); + csr.osContext = osContext.get(); + MockGraphicsAllocationTaskCount graphicsAllocation; + graphicsAllocation.updateTaskCount(10u, 0u); + *csr.getTagAddress() = 5; + graphicsAllocation.hostPtrTaskCountAssignment = 1; + auto calledTimesBefore = graphicsAllocation.updateTaskCountCalleedTimes; + graphicsAllocation.prepareHostPtrForResidency(&csr); + EXPECT_EQ(graphicsAllocation.updateTaskCountCalleedTimes, calledTimesBefore); +} + +HWTEST_F(GraphicsAllocationTests, givenGraphicsAllocationAllocTaskCountLowerThanInCsrThenUpdateTaskCountWasCalled) { + executionEnvironment.initializeMemoryManager(); + auto osContext = std::unique_ptr(OsContext::create(nullptr, 0, EngineDescriptorHelper::getDefaultDescriptor())); + MockCommandStreamReceiver csr(executionEnvironment, 0, 1); + csr.osContext = osContext.get(); + MockGraphicsAllocationTaskCount graphicsAllocation; + graphicsAllocation.updateTaskCount(5u, 0u); + *csr.getTagAddress() = 10; + graphicsAllocation.hostPtrTaskCountAssignment = 1; + auto calledTimesBefore = graphicsAllocation.updateTaskCountCalleedTimes; + graphicsAllocation.prepareHostPtrForResidency(&csr); + EXPECT_EQ(graphicsAllocation.updateTaskCountCalleedTimes, calledTimesBefore + 1u); +} + +HWTEST_F(GraphicsAllocationTests, givenGraphicsAllocationAllocTaskCountLowerThanInCsrThenAssignmentCountIsDecremented) { + executionEnvironment.initializeMemoryManager(); + auto osContext = std::unique_ptr(OsContext::create(nullptr, 0, EngineDescriptorHelper::getDefaultDescriptor())); + MockCommandStreamReceiver csr(executionEnvironment, 0, 1); + csr.osContext = osContext.get(); + MockGraphicsAllocationTaskCount graphicsAllocation; + graphicsAllocation.updateTaskCount(5u, 0u); + *csr.getTagAddress() = 10; + graphicsAllocation.hostPtrTaskCountAssignment = 1; + graphicsAllocation.prepareHostPtrForResidency(&csr); + EXPECT_EQ(graphicsAllocation.hostPtrTaskCountAssignment, 0u); +} \ No newline at end of file