performance(ocl): increase buffer pool size

increase pool size to 2MB and threshold to 1MB
add limit to the number of pools, set to 2

Related-To: NEO-9690

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2023-12-21 15:44:11 +00:00
committed by Compute-Runtime-Automation
parent 2d15517fbe
commit 6e434e0424
6 changed files with 72 additions and 26 deletions

View File

@@ -101,7 +101,7 @@ struct CopyEngineXeHPAndLater : public MulticontextAubFixture, public ::testing:
EXPECT_EQ(CL_SUCCESS, retVal);
if (compressed) {
EXPECT_TRUE(graphicsAllocation->getDefaultGmm()->isCompressionEnabled);
EXPECT_TRUE(graphicsAllocation->isCompressionEnabled());
}
EXPECT_EQ(!inLocalMemory, MemoryPoolHelper::isSystemMemoryPool(graphicsAllocation->getMemoryPool()));
@@ -113,7 +113,7 @@ struct CopyEngineXeHPAndLater : public MulticontextAubFixture, public ::testing:
}
uint64_t getGpuVA(Buffer &buffer) {
return buffer.getGraphicsAllocation(this->rootDeviceIndex)->getGpuAddress();
return ptrOffset(buffer.getGraphicsAllocation(this->rootDeviceIndex)->getGpuAddress(), buffer.getOffset());
}
void executeBlitCommand(const BlitProperties &blitProperties, bool blocking) {
@@ -160,8 +160,8 @@ struct CopyEngineXeHPAndLater : public MulticontextAubFixture, public ::testing:
CommandStreamReceiver *bcsCsr = nullptr;
TimestampPacketContainer timestampPacketContainer;
CsrDependencies csrDependencies;
const size_t bufferSize = MemoryConstants::pageSize64k + BlitterConstants::maxBlitWidth + 3;
size_t offset = (bufferSize / 4) - 3;
static constexpr size_t bufferSize = MemoryConstants::pageSize64k + BlitterConstants::maxBlitWidth + 3;
static constexpr size_t offset = (bufferSize / 4) - 3;
aub_stream::EngineType bcsEngineType = aub_stream::EngineType::ENGINE_BCS;
std::unique_ptr<uint8_t[]> compressiblePattern;
@@ -185,17 +185,17 @@ void CopyEngineXeHPAndLater<numTiles, testLocalMemory>::givenNotCompressedBuffer
// Buffer to Buffer - uncompressed HBM -> compressed HBM
auto blitProperties = BlitProperties::constructPropertiesForCopy(dstCompressedBuffer->getGraphicsAllocation(rootDeviceIndex),
srcNotCompressedBuffer->getGraphicsAllocation(rootDeviceIndex),
0, 0, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation());
{dstCompressedBuffer->getOffset(), 0, 0}, {srcNotCompressedBuffer->getOffset(), 0, 0}, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation());
executeBlitCommand(blitProperties, true);
// Buffer to Buffer - uncompressed HBM -> uncompressed HBM
blitProperties = BlitProperties::constructPropertiesForCopy(dstNotCompressedBuffer->getGraphicsAllocation(rootDeviceIndex),
srcNotCompressedBuffer->getGraphicsAllocation(rootDeviceIndex),
0, 0, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation());
{dstNotCompressedBuffer->getOffset(), 0, 0}, {srcNotCompressedBuffer->getOffset(), 0, 0}, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation());
executeBlitCommand(blitProperties, true);
// Buffer to Buffer - compressed HBM -> uncompressed HBM
blitProperties = BlitProperties::constructPropertiesForCopy(dstResolvedBuffer->getGraphicsAllocation(rootDeviceIndex),
dstCompressedBuffer->getGraphicsAllocation(rootDeviceIndex),
0, 0, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation());
{dstResolvedBuffer->getOffset(), 0, 0}, {dstCompressedBuffer->getOffset(), 0, 0}, {bufferSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation());
executeBlitCommand(blitProperties, true);
blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::bufferToHostPtr, *bcsCsr,
@@ -354,7 +354,7 @@ void CopyEngineXeHPAndLater<numTiles, testLocalMemory>::givenOffsetsWhenBltExecu
auto blitProperties = BlitProperties::constructPropertiesForCopy(dstBuffer->getGraphicsAllocation(rootDeviceIndex),
srcBuffer->getGraphicsAllocation(rootDeviceIndex),
{offset, 0, 0}, 0, {copiedSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation());
{offset + dstBuffer->getOffset(), 0, 0}, {srcBuffer->getOffset(), 0, 0}, {copiedSize, 1, 1}, 0, 0, 0, 0, bcsCsr->getClearColorAllocation());
executeBlitCommand(blitProperties, true);
@@ -584,11 +584,11 @@ void CopyEngineXeHPAndLater<numTiles, testLocalMemory>::givenCopyBufferRectWithO
auto srcBuffer = createBuffer(false, testLocalMemory, srcMemory.get());
auto dstBuffer = createBuffer(false, testLocalMemory, destMemory.get());
auto pSrcMemory = &srcMemory[0];
auto pDestMemory = reinterpret_cast<uint8_t *>(getGpuAddress(*dstBuffer));
auto pDestMemory = reinterpret_cast<uint8_t *>(getGpuAddress((*dstBuffer)));
auto clearColorAllocation = bcsCsr->getClearColorAllocation();
size_t srcOrigin[] = {0, 0, 0};
size_t dstOrigin[] = {1 * sizeof(uint8_t), 0, 0};
size_t srcOrigin[] = {srcBuffer->getOffset(), 0, 0};
size_t dstOrigin[] = {1 * sizeof(uint8_t) + dstBuffer->getOffset(), 0, 0};
size_t region[] = {2 * sizeof(uint8_t), 2, 2};
size_t srcRowPitch = region[0];
size_t srcSlicePitch = srcRowPitch * region[1];
@@ -613,8 +613,8 @@ void CopyEngineXeHPAndLater<numTiles, testLocalMemory>::givenCopyBufferRectWithO
pSrcMemory = ptrOffset(pSrcMemory, 0);
expectMemoryNotEqual<FamilyType>(ptrOffset(pDestMemory, dstOrigin[0]), pSrcMemory, copySize + 1, 0, 0);
expectMemory<FamilyType>(ptrOffset(pDestMemory, dstOrigin[0]), pSrcMemory, copySize, 0, 0);
expectMemoryNotEqual<FamilyType>(ptrOffset(pDestMemory, sizeof(uint8_t)), pSrcMemory, copySize + 1, 0, 0);
expectMemory<FamilyType>(ptrOffset(pDestMemory, sizeof(uint8_t)), pSrcMemory, copySize, 0, 0);
}
template <uint32_t numTiles, bool testLocalMemory>
@@ -638,8 +638,8 @@ void CopyEngineXeHPAndLater<numTiles, testLocalMemory>::givenCopyBufferRectWithB
auto pDestMemory = reinterpret_cast<uint8_t *>(getGpuAddress(*dstBuffer));
auto clearColorAllocation = bcsCsr->getClearColorAllocation();
size_t srcOrigin[] = {0, 0, 0};
size_t dstOrigin[] = {1, 1, 1};
size_t srcOrigin[] = {srcBuffer->getOffset(), 0, 0};
size_t dstOrigin[] = {1 + dstBuffer->getOffset(), 1, 1};
size_t region[] = {20, 16, 2};
size_t srcRowPitch = region[0];
size_t srcSlicePitch = srcRowPitch * region[1];
@@ -660,7 +660,7 @@ void CopyEngineXeHPAndLater<numTiles, testLocalMemory>::givenCopyBufferRectWithB
executeBlitCommand(blitProperties, false);
bcsCsr->waitForTaskCountWithKmdNotifyFallback(0, 0, false, QueueThrottle::MEDIUM);
size_t dstOffset = dstOrigin[0] + dstOrigin[1] * dstRowPitch + dstOrigin[2] * dstSlicePitch;
size_t dstOffset = 1 + dstOrigin[1] * dstRowPitch + dstOrigin[2] * dstSlicePitch;
expectMemoryNotEqual<FamilyType>(ptrOffset(pDestMemory, dstOffset), pSrcMemory, copySize + 1, 0, 0);
expectMemory<FamilyType>(ptrOffset(pDestMemory, dstOffset), pSrcMemory, copySize, 0, 0);

View File

@@ -207,6 +207,19 @@ TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndS
std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal));
EXPECT_NE(nullptr, buffer);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(0u, poolAllocator->bufferPools[0].chunkAllocator->getUsedSize());
}
TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndFlagCompressedPreferredWhenBufferCreateCalledThenDoNotUsePool) {
EXPECT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled(context.get()));
EXPECT_EQ(1u, poolAllocator->bufferPools.size());
EXPECT_NE(nullptr, poolAllocator->bufferPools[0].mainStorage.get());
size = PoolAllocator::smallBufferThreshold;
flags |= CL_MEM_COMPRESSED_HINT_INTEL;
std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal));
EXPECT_NE(nullptr, buffer);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(0u, poolAllocator->bufferPools[0].chunkAllocator->getUsedSize());
}
TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndSizeLowerThenChunkAlignmentWhenBufferCreatedAndDestroyedThenSizeIsAsRequestedAndCorrectSizeIsNotFreed) {
@@ -344,6 +357,32 @@ TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndB
EXPECT_EQ(size, poolAllocator->bufferPools[1].chunkAllocator->getUsedSize());
}
TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndBufferPoolIsExhaustedAndAllocationsAreInUseAndPoolLimitIsReachedThenNewPoolIsNotCreated) {
EXPECT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled(context.get()));
EXPECT_EQ(1u, poolAllocator->bufferPools.size());
EXPECT_NE(nullptr, poolAllocator->bufferPools[0].mainStorage.get());
constexpr auto buffersToCreate = (PoolAllocator::aggregatedSmallBuffersPoolSize / PoolAllocator::smallBufferThreshold) * PoolAllocator::maxPoolCount;
std::vector<std::unique_ptr<Buffer>> buffers(buffersToCreate);
for (auto i = 0u; i < buffersToCreate; ++i) {
buffers[i].reset(Buffer::create(context.get(), flags, size, hostPtr, retVal));
EXPECT_EQ(retVal, CL_SUCCESS);
}
EXPECT_EQ(PoolAllocator::maxPoolCount, poolAllocator->bufferPools.size());
for (auto i = 0u; i < PoolAllocator::maxPoolCount; ++i) {
EXPECT_EQ(PoolAllocator::aggregatedSmallBuffersPoolSize, poolAllocator->bufferPools[i].chunkAllocator->getUsedSize());
}
EXPECT_EQ(1u, mockMemoryManager->allocInUseCalled);
mockMemoryManager->deferAllocInUse = true;
mockMemoryManager->failInDevicePoolWithError = true;
std::unique_ptr<Buffer> bufferAfterExhaustMustFail(Buffer::create(context.get(), flags, size, hostPtr, retVal));
EXPECT_EQ(nullptr, bufferAfterExhaustMustFail.get());
EXPECT_NE(retVal, CL_SUCCESS);
EXPECT_EQ(PoolAllocator::maxPoolCount, poolAllocator->bufferPools.size());
EXPECT_EQ(3u, mockMemoryManager->allocInUseCalled);
}
TEST_F(AggregatedSmallBuffersEnabledTest, givenCopyHostPointerWhenCreatingBufferButCopyFailedThenDoNotUsePool) {
class MockCommandQueueFailFirstEnqueueWrite : public MockCommandQueue {
public: