diff --git a/opencl/source/context/context.cpp b/opencl/source/context/context.cpp index cf6e44cd17..043a0244de 100644 --- a/opencl/source/context/context.cpp +++ b/opencl/source/context/context.cpp @@ -624,8 +624,11 @@ Buffer *Context::BufferPoolAllocator::allocateBufferFromPool(const MemoryPropert return bufferFromPool; } - this->addNewBufferPool(BufferPool{this->context}); - return this->allocateFromPools(memoryProperties, flags, flagsIntel, requestedSize, hostPtr, errcodeRet); + if (this->bufferPools.size() < BufferPoolAllocator::maxPoolCount) { + this->addNewBufferPool(BufferPool{this->context}); + return this->allocateFromPools(memoryProperties, flags, flagsIntel, requestedSize, hostPtr, errcodeRet); + } + return nullptr; } Buffer *Context::BufferPoolAllocator::allocateFromPools(const MemoryProperties &memoryProperties, diff --git a/opencl/source/context/pool_buffer_additional_checks.cpp b/opencl/source/context/pool_buffer_additional_checks.cpp index 6a6b9d7e92..148b8e2fe3 100644 --- a/opencl/source/context/pool_buffer_additional_checks.cpp +++ b/opencl/source/context/pool_buffer_additional_checks.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -9,7 +9,8 @@ namespace NEO { bool Context::BufferPoolAllocator::flagsAllowBufferFromPool(const cl_mem_flags &flags, const cl_mem_flags_intel &flagsIntel) const { - return true; + return (flagsIntel & CL_MEM_COMPRESSED_HINT_INTEL) == false && + (flags & CL_MEM_COMPRESSED_HINT_INTEL) == false; } } // namespace NEO \ No newline at end of file diff --git a/opencl/source/mem_obj/buffer.cpp b/opencl/source/mem_obj/buffer.cpp index 8f69d2be1b..1b7389e4a7 100644 --- a/opencl/source/mem_obj/buffer.cpp +++ b/opencl/source/mem_obj/buffer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -214,9 +214,9 @@ bool inline copyHostPointer(Buffer *buffer, copyOnCpuAllowed = debugManager.flags.CopyHostPtrOnCpu.get() == 1; } if (auto lockedPointer = copyOnCpuAllowed ? device.getMemoryManager()->lockResource(memory) : nullptr) { - memcpy_s(ptrOffset(lockedPointer, buffer->getOffset()), size, hostPtr, size); memory->setAubWritable(true, GraphicsAllocation::defaultBank); memory->setTbxWritable(true, GraphicsAllocation::defaultBank); + memcpy_s(ptrOffset(lockedPointer, buffer->getOffset()), size, hostPtr, size); return true; } else { auto blitMemoryToAllocationResult = BlitOperationResult::unsupported; diff --git a/opencl/test/unit_test/aub_tests/command_stream/copy_engine_aub_tests_xehp_and_later.h b/opencl/test/unit_test/aub_tests/command_stream/copy_engine_aub_tests_xehp_and_later.h index 88694333c5..316fdc6d42 100644 --- a/opencl/test/unit_test/aub_tests/command_stream/copy_engine_aub_tests_xehp_and_later.h +++ b/opencl/test/unit_test/aub_tests/command_stream/copy_engine_aub_tests_xehp_and_later.h @@ -101,7 +101,7 @@ struct CopyEngineXeHPAndLater : public MulticontextAubFixture, public ::testing: EXPECT_EQ(CL_SUCCESS, retVal); if (compressed) { - EXPECT_TRUE(graphicsAllocation->getDefaultGmm()->isCompressionEnabled); + EXPECT_TRUE(graphicsAllocation->isCompressionEnabled()); } EXPECT_EQ(!inLocalMemory, MemoryPoolHelper::isSystemMemoryPool(graphicsAllocation->getMemoryPool())); @@ -113,7 +113,7 @@ struct CopyEngineXeHPAndLater : public MulticontextAubFixture, public ::testing: } uint64_t getGpuVA(Buffer &buffer) { - return buffer.getGraphicsAllocation(this->rootDeviceIndex)->getGpuAddress(); + return ptrOffset(buffer.getGraphicsAllocation(this->rootDeviceIndex)->getGpuAddress(), buffer.getOffset()); } void executeBlitCommand(const BlitProperties &blitProperties, bool blocking) { @@ -160,8 +160,8 @@ struct CopyEngineXeHPAndLater : public MulticontextAubFixture, public ::testing: CommandStreamReceiver *bcsCsr = nullptr; TimestampPacketContainer timestampPacketContainer; CsrDependencies csrDependencies; - const size_t bufferSize = MemoryConstants::pageSize64k + BlitterConstants::maxBlitWidth + 3; - size_t offset = (bufferSize / 4) - 3; + static constexpr size_t bufferSize = MemoryConstants::pageSize64k + BlitterConstants::maxBlitWidth + 3; + static constexpr size_t offset = (bufferSize / 4) - 3; aub_stream::EngineType bcsEngineType = aub_stream::EngineType::ENGINE_BCS; std::unique_ptr compressiblePattern; @@ -185,17 +185,17 @@ void CopyEngineXeHPAndLater::givenNotCompressedBuffer // Buffer to Buffer - uncompressed HBM -> compressed HBM auto blitProperties = BlitProperties::constructPropertiesForCopy(dstCompressedBuffer->getGraphicsAllocation(rootDeviceIndex), srcNotCompressedBuffer->getGraphicsAllocation(rootDeviceIndex), - 0, 0, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation()); + {dstCompressedBuffer->getOffset(), 0, 0}, {srcNotCompressedBuffer->getOffset(), 0, 0}, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation()); executeBlitCommand(blitProperties, true); // Buffer to Buffer - uncompressed HBM -> uncompressed HBM blitProperties = BlitProperties::constructPropertiesForCopy(dstNotCompressedBuffer->getGraphicsAllocation(rootDeviceIndex), srcNotCompressedBuffer->getGraphicsAllocation(rootDeviceIndex), - 0, 0, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation()); + {dstNotCompressedBuffer->getOffset(), 0, 0}, {srcNotCompressedBuffer->getOffset(), 0, 0}, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation()); executeBlitCommand(blitProperties, true); // Buffer to Buffer - compressed HBM -> uncompressed HBM blitProperties = BlitProperties::constructPropertiesForCopy(dstResolvedBuffer->getGraphicsAllocation(rootDeviceIndex), dstCompressedBuffer->getGraphicsAllocation(rootDeviceIndex), - 0, 0, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation()); + {dstResolvedBuffer->getOffset(), 0, 0}, {dstCompressedBuffer->getOffset(), 0, 0}, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation()); executeBlitCommand(blitProperties, true); blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::bufferToHostPtr, *bcsCsr, @@ -354,7 +354,7 @@ void CopyEngineXeHPAndLater::givenOffsetsWhenBltExecu auto blitProperties = BlitProperties::constructPropertiesForCopy(dstBuffer->getGraphicsAllocation(rootDeviceIndex), srcBuffer->getGraphicsAllocation(rootDeviceIndex), - {offset, 0, 0}, 0, {copiedSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation()); + {offset + dstBuffer->getOffset(), 0, 0}, {srcBuffer->getOffset(), 0, 0}, {copiedSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation()); executeBlitCommand(blitProperties, true); @@ -584,11 +584,11 @@ void CopyEngineXeHPAndLater::givenCopyBufferRectWithO auto srcBuffer = createBuffer(false, testLocalMemory, srcMemory.get()); auto dstBuffer = createBuffer(false, testLocalMemory, destMemory.get()); auto pSrcMemory = &srcMemory[0]; - auto pDestMemory = reinterpret_cast(getGpuAddress(*dstBuffer)); + auto pDestMemory = reinterpret_cast(getGpuAddress((*dstBuffer))); auto clearColorAllocation = bcsCsr->getClearColorAllocation(); - size_t srcOrigin[] = {0, 0, 0}; - size_t dstOrigin[] = {1 * sizeof(uint8_t), 0, 0}; + size_t srcOrigin[] = {srcBuffer->getOffset(), 0, 0}; + size_t dstOrigin[] = {1 * sizeof(uint8_t) + dstBuffer->getOffset(), 0, 0}; size_t region[] = {2 * sizeof(uint8_t), 2, 2}; size_t srcRowPitch = region[0]; size_t srcSlicePitch = srcRowPitch * region[1]; @@ -613,8 +613,8 @@ void CopyEngineXeHPAndLater::givenCopyBufferRectWithO pSrcMemory = ptrOffset(pSrcMemory, 0); - expectMemoryNotEqual(ptrOffset(pDestMemory, dstOrigin[0]), pSrcMemory, copySize + 1, 0, 0); - expectMemory(ptrOffset(pDestMemory, dstOrigin[0]), pSrcMemory, copySize, 0, 0); + expectMemoryNotEqual(ptrOffset(pDestMemory, sizeof(uint8_t)), pSrcMemory, copySize + 1, 0, 0); + expectMemory(ptrOffset(pDestMemory, sizeof(uint8_t)), pSrcMemory, copySize, 0, 0); } template @@ -638,8 +638,8 @@ void CopyEngineXeHPAndLater::givenCopyBufferRectWithB auto pDestMemory = reinterpret_cast(getGpuAddress(*dstBuffer)); auto clearColorAllocation = bcsCsr->getClearColorAllocation(); - size_t srcOrigin[] = {0, 0, 0}; - size_t dstOrigin[] = {1, 1, 1}; + size_t srcOrigin[] = {srcBuffer->getOffset(), 0, 0}; + size_t dstOrigin[] = {1 + dstBuffer->getOffset(), 1, 1}; size_t region[] = {20, 16, 2}; size_t srcRowPitch = region[0]; size_t srcSlicePitch = srcRowPitch * region[1]; @@ -660,7 +660,7 @@ void CopyEngineXeHPAndLater::givenCopyBufferRectWithB executeBlitCommand(blitProperties, false); bcsCsr->waitForTaskCountWithKmdNotifyFallback(0, 0, false, QueueThrottle::MEDIUM); - size_t dstOffset = dstOrigin[0] + dstOrigin[1] * dstRowPitch + dstOrigin[2] * dstSlicePitch; + size_t dstOffset = 1 + dstOrigin[1] * dstRowPitch + dstOrigin[2] * dstSlicePitch; expectMemoryNotEqual(ptrOffset(pDestMemory, dstOffset), pSrcMemory, copySize + 1, 0, 0); expectMemory(ptrOffset(pDestMemory, dstOffset), pSrcMemory, copySize, 0, 0); diff --git a/opencl/test/unit_test/mem_obj/buffer_pool_alloc_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_pool_alloc_tests.cpp index 483099b7da..c1c753bd36 100644 --- a/opencl/test/unit_test/mem_obj/buffer_pool_alloc_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_pool_alloc_tests.cpp @@ -207,6 +207,19 @@ TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndS std::unique_ptr buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal)); EXPECT_NE(nullptr, buffer); EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(0u, poolAllocator->bufferPools[0].chunkAllocator->getUsedSize()); +} + +TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndFlagCompressedPreferredWhenBufferCreateCalledThenDoNotUsePool) { + EXPECT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled(context.get())); + EXPECT_EQ(1u, poolAllocator->bufferPools.size()); + EXPECT_NE(nullptr, poolAllocator->bufferPools[0].mainStorage.get()); + size = PoolAllocator::smallBufferThreshold; + flags |= CL_MEM_COMPRESSED_HINT_INTEL; + std::unique_ptr buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal)); + EXPECT_NE(nullptr, buffer); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(0u, poolAllocator->bufferPools[0].chunkAllocator->getUsedSize()); } TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndSizeLowerThenChunkAlignmentWhenBufferCreatedAndDestroyedThenSizeIsAsRequestedAndCorrectSizeIsNotFreed) { @@ -344,6 +357,32 @@ TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndB EXPECT_EQ(size, poolAllocator->bufferPools[1].chunkAllocator->getUsedSize()); } +TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndBufferPoolIsExhaustedAndAllocationsAreInUseAndPoolLimitIsReachedThenNewPoolIsNotCreated) { + EXPECT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled(context.get())); + EXPECT_EQ(1u, poolAllocator->bufferPools.size()); + EXPECT_NE(nullptr, poolAllocator->bufferPools[0].mainStorage.get()); + + constexpr auto buffersToCreate = (PoolAllocator::aggregatedSmallBuffersPoolSize / PoolAllocator::smallBufferThreshold) * PoolAllocator::maxPoolCount; + std::vector> buffers(buffersToCreate); + for (auto i = 0u; i < buffersToCreate; ++i) { + buffers[i].reset(Buffer::create(context.get(), flags, size, hostPtr, retVal)); + EXPECT_EQ(retVal, CL_SUCCESS); + } + EXPECT_EQ(PoolAllocator::maxPoolCount, poolAllocator->bufferPools.size()); + for (auto i = 0u; i < PoolAllocator::maxPoolCount; ++i) { + EXPECT_EQ(PoolAllocator::aggregatedSmallBuffersPoolSize, poolAllocator->bufferPools[i].chunkAllocator->getUsedSize()); + } + EXPECT_EQ(1u, mockMemoryManager->allocInUseCalled); + mockMemoryManager->deferAllocInUse = true; + mockMemoryManager->failInDevicePoolWithError = true; + + std::unique_ptr bufferAfterExhaustMustFail(Buffer::create(context.get(), flags, size, hostPtr, retVal)); + EXPECT_EQ(nullptr, bufferAfterExhaustMustFail.get()); + EXPECT_NE(retVal, CL_SUCCESS); + EXPECT_EQ(PoolAllocator::maxPoolCount, poolAllocator->bufferPools.size()); + EXPECT_EQ(3u, mockMemoryManager->allocInUseCalled); +} + TEST_F(AggregatedSmallBuffersEnabledTest, givenCopyHostPointerWhenCreatingBufferButCopyFailedThenDoNotUsePool) { class MockCommandQueueFailFirstEnqueueWrite : public MockCommandQueue { public: diff --git a/shared/source/utilities/buffer_pool_allocator.h b/shared/source/utilities/buffer_pool_allocator.h index dfaf571b94..259871b466 100644 --- a/shared/source/utilities/buffer_pool_allocator.h +++ b/shared/source/utilities/buffer_pool_allocator.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -25,10 +25,11 @@ class MemoryManager; template struct SmallBuffersParams { protected: - static constexpr auto aggregatedSmallBuffersPoolSize = 64 * MemoryConstants::kiloByte; - static constexpr auto smallBufferThreshold = 4 * MemoryConstants::kiloByte; - static constexpr auto chunkAlignment = 512u; + static constexpr auto aggregatedSmallBuffersPoolSize = 2 * MemoryConstants::megaByte; + static constexpr auto smallBufferThreshold = 1 * MemoryConstants::megaByte; + static constexpr auto chunkAlignment = MemoryConstants::pageSize64k; static constexpr auto startingOffset = chunkAlignment; + static constexpr auto maxPoolCount = 2u; }; template @@ -41,6 +42,7 @@ struct AbstractBuffersPool : public SmallBuffersParams, public NonCopyabl using Params = SmallBuffersParams; using Params::aggregatedSmallBuffersPoolSize; using Params::chunkAlignment; + using Params::maxPoolCount; using Params::smallBufferThreshold; using Params::startingOffset; using AllocsVecCRef = const StackVec &; @@ -75,6 +77,7 @@ class AbstractBuffersAllocator : public SmallBuffersParams { using Params = SmallBuffersParams; using Params::aggregatedSmallBuffersPoolSize; using Params::chunkAlignment; + using Params::maxPoolCount; using Params::smallBufferThreshold; using Params::startingOffset; static_assert(aggregatedSmallBuffersPoolSize > smallBufferThreshold, "Largest allowed buffer needs to fit in pool");