performance: prealloc cmdbuffer on mtl

Preallocate 2 command buffer allocations for each command queue
initialized on MTL.

Related-To: NEO-8152

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2023-11-08 12:44:01 +00:00
committed by Compute-Runtime-Automation
parent 79fbd8fedf
commit 7a6fc209dd
16 changed files with 107 additions and 23 deletions

View File

@@ -1142,6 +1142,8 @@ HWTEST_F(CommandQueueHwTest, givenCsrClientWhenCallingSyncPointsThenUnregister)
}
HWTEST_F(CommandQueueHwTest, givenKernelSplitEnqueueReadBufferWhenBlockedThenEnqueueSurfacesMakeResidentIsCalledOnce) {
DebugManagerStateRestore restorer;
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
UserEvent userEvent(context);
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
csr.storeMakeResidentAllocations = true;

View File

@@ -499,6 +499,8 @@ TEST_F(CommandQueueCommandStreamTest, WhenGettingCommandStreamWithNewSizeThenMax
}
TEST_F(CommandQueueCommandStreamTest, givenCommandStreamReceiverWithReusableAllocationsWhenAskedForCommandStreamThenReturnsAllocationFromReusablePool) {
DebugManagerStateRestore restorer;
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
MockCommandQueue cmdQ(context.get(), pClDevice, props, false);
@@ -518,7 +520,9 @@ TEST_F(CommandQueueCommandStreamTest, givenCommandStreamReceiverWithReusableAllo
EXPECT_TRUE(commandStreamReceiver.getAllocationsForReuse().peekIsEmpty());
}
TEST_F(CommandQueueCommandStreamTest, givenCommandQueueWhenItIsDestroyedThenCommandStreamIsPutOnTheReusabeList) {
TEST_F(CommandQueueCommandStreamTest, givenCommandQueueWhenItIsDestroyedThenCommandStreamIsPutOnTheReusableList) {
DebugManagerStateRestore restorer;
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
auto cmdQ = new MockCommandQueue(context.get(), pClDevice, 0, false);
const auto &commandStream = cmdQ->getCS(100);
auto graphicsAllocation = commandStream.getGraphicsAllocation();
@@ -531,6 +535,8 @@ TEST_F(CommandQueueCommandStreamTest, givenCommandQueueWhenItIsDestroyedThenComm
}
TEST_F(CommandQueueCommandStreamTest, WhenAskedForNewCommandStreamThenOldHeapIsStoredForReuse) {
DebugManagerStateRestore restorer;
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
MockCommandQueue cmdQ(context.get(), pClDevice, props, false);
@@ -654,6 +660,8 @@ TEST_P(CommandQueueIndirectHeapTest, WhenGettingIndirectHeapThenSizeIsAlignedToC
}
HWTEST_P(CommandQueueIndirectHeapTest, givenCommandStreamReceiverWithReusableAllocationsWhenAskedForHeapAllocationThenAllocationFromReusablePoolIsReturned) {
DebugManagerStateRestore restorer;
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
MockCommandQueue cmdQ(context.get(), pClDevice, props, false);
@@ -739,12 +747,14 @@ TEST_P(CommandQueueIndirectHeapTest, GivenCommandQueueWithoutHeapAllocationWhenA
memoryManager->freeGraphicsMemory(graphicsAllocation);
}
TEST_P(CommandQueueIndirectHeapTest, givenCommandQueueWithResourceCachingActiveWhenQueueISDestroyedThenIndirectHeapIsNotOnReuseList) {
TEST_P(CommandQueueIndirectHeapTest, givenCommandQueueWithResourceCachingActiveWhenQueueIsDestroyedThenIndirectHeapIsNotOnReuseList) {
DebugManagerStateRestore restorer;
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
auto cmdQ = new MockCommandQueue(context.get(), pClDevice, 0, false);
cmdQ->getIndirectHeap(this->GetParam(), 100);
EXPECT_TRUE(pDevice->getDefaultEngine().commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
// now destroy command queue, heap should go to reusable list
// now destroy command queue, heap should NOT go to reusable list
delete cmdQ;
EXPECT_TRUE(pDevice->getDefaultEngine().commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
}
@@ -783,6 +793,8 @@ TEST_P(CommandQueueIndirectHeapTest, GivenCommandQueueWithoutHeapAllocatedWhenIn
}
TEST_P(CommandQueueIndirectHeapTest, GivenCommandQueueWithHeapWhenGraphicAllocationIsNullThenNothingOnReuseList) {
DebugManagerStateRestore restorer;
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
MockCommandQueue cmdQ(context.get(), pClDevice, props, false);

View File

@@ -816,6 +816,8 @@ HWTEST_F(DispatchWalkerTest, givenBlockedQueueWhenDispatchWalkerIsCalledThenComm
}
HWTEST_F(DispatchWalkerTest, givenThereAreAllocationsForReuseWhenDispatchWalkerIsCalledThenCommandStreamObtainsReusableAllocation) {
DebugManagerStateRestore restorer;
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
MockKernel kernel(program.get(), kernelInfo, *pClDevice);
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
MockMultiDispatchInfo multiDispatchInfo(pClDevice, &kernel);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*

View File

@@ -10,6 +10,7 @@
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/memory_manager/allocations_list.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/test/common/helpers/unit_test_helper.h"
@@ -416,6 +417,10 @@ HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenPatternShouldBeAligned)
HWTEST_F(EnqueueFillBufferCmdTests, WhenFillBufferIsCalledTwiceThenPatternAllocationIsReused) {
auto &csr = pCmdQ->getGpgpuCommandStreamReceiver();
if (pDevice->getProductHelper().getCommandBuffersPreallocatedPerCommandQueue() > 0) {
csr.flushTagUpdate();
csr.getInternalAllocationStorage()->cleanAllocationList(-1, AllocationUsage::REUSABLE_ALLOCATION);
}
ASSERT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
EnqueueFillBufferHelper<>::enqueueFillBuffer(pCmdQ, buffer);
ASSERT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
@@ -429,6 +434,10 @@ HWTEST_F(EnqueueFillBufferCmdTests, WhenFillBufferIsCalledTwiceThenPatternAlloca
HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenPatternOfSizeOneByteShouldGetPreparedForMiddleKernel) {
auto &csr = pCmdQ->getGpgpuCommandStreamReceiver();
if (pDevice->getProductHelper().getCommandBuffersPreallocatedPerCommandQueue() > 0) {
csr.flushTagUpdate();
csr.getInternalAllocationStorage()->cleanAllocationList(-1, AllocationUsage::REUSABLE_ALLOCATION);
}
ASSERT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
ASSERT_TRUE(csr.getTemporaryAllocations().peekIsEmpty());
@@ -462,6 +471,10 @@ HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenPatternOfSizeOneByteSho
HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenPatternOfSizeTwoBytesShouldGetPreparedForMiddleKernel) {
auto &csr = pCmdQ->getGpgpuCommandStreamReceiver();
if (pDevice->getProductHelper().getCommandBuffersPreallocatedPerCommandQueue() > 0) {
csr.flushTagUpdate();
csr.getInternalAllocationStorage()->cleanAllocationList(-1, AllocationUsage::REUSABLE_ALLOCATION);
}
ASSERT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
ASSERT_TRUE(csr.getTemporaryAllocations().peekIsEmpty());

View File

@@ -333,7 +333,8 @@ HWTEST_F(EnqueueHandlerTest, WhenEnqueuingHandlerCallOnEnqueueMarkerThenCallProc
nullptr);
EXPECT_FALSE(csr->processEvictionCalled);
EXPECT_EQ(0u, csr->madeResidentGfxAllocations.size());
const auto expectedMadeResidentGfxAllocations = pDevice->getProductHelper().getCommandBuffersPreallocatedPerCommandQueue();
EXPECT_EQ(expectedMadeResidentGfxAllocations, csr->madeResidentGfxAllocations.size());
EXPECT_EQ(0u, csr->madeNonResidentGfxAllocations.size());
}
@@ -732,24 +733,23 @@ struct EnqueueHandlerTestBasic : public ::testing::Test {
device = std::make_unique<MockClDevice>(MockDevice::createWithExecutionEnvironment<MockDevice>(nullptr, executionEnvironment, 0u));
context = std::make_unique<MockContext>(device.get());
auto mockCmdQ = std::make_unique<MockCmdQueueType>(context.get(), device.get(), nullptr);
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(mockCmdQ->getGpgpuCommandStreamReceiver());
ultCsr.taskCount = initialTaskCount;
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(device->getGpgpuCommandStreamReceiver());
mockInternalAllocationStorage = new MockInternalAllocationStorage(ultCsr);
ultCsr.internalAllocationStorage.reset(mockInternalAllocationStorage);
auto mockCmdQ = std::make_unique<MockCmdQueueType>(context.get(), device.get(), nullptr);
ultCsr.taskCount = initialTaskCount;
return mockCmdQ;
}
MockInternalAllocationStorage *mockInternalAllocationStorage = nullptr;
const uint32_t initialTaskCount = 100;
std::unique_ptr<MockClDevice> device;
std::unique_ptr<MockContext> context;
};
HWTEST_F(EnqueueHandlerTestBasic, givenEnqueueHandlerWhenCommandIsBlokingThenCompletionStampTaskCountIsPassedToWaitForTaskCountAndCleanAllocationListAsRequiredTaskCount) {
HWTEST_F(EnqueueHandlerTestBasic, givenEnqueueHandlerWhenCommandIsBlockingThenCompletionStampTaskCountIsPassedToWaitForTaskCountAndCleanAllocationListAsRequiredTaskCount) {
auto mockCmdQ = setupFixtureAndCreateMockCommandQueue<MockCommandQueueHw<FamilyType>, FamilyType>();
MockKernelWithInternals kernelInternals(*device, context.get());
Kernel *kernel = kernelInternals.mockKernel;

View File

@@ -793,9 +793,10 @@ HWTEST_F(EnqueueKernelTest, givenCommandStreamReceiverInBatchingModeWhenEnqueueK
size_t timestampPacketSurfacesCount = mockCsr->peekTimestampPacketWriteEnabled() ? 1 : 0;
size_t fenceSurfaceCount = mockCsr->globalFenceAllocation ? 1 : 0;
size_t clearColorSize = mockCsr->clearColorAllocation ? 1 : 0;
size_t commandBufferCount = pDevice->getProductHelper().getCommandBuffersPreallocatedPerCommandQueue() > 0 ? 0 : 1;
EXPECT_EQ(0, mockCsr->flushCalledCount);
EXPECT_EQ(5u + csrSurfaceCount + timestampPacketSurfacesCount + fenceSurfaceCount + clearColorSize, cmdBuffer->surfaces.size());
EXPECT_EQ(4u + csrSurfaceCount + timestampPacketSurfacesCount + fenceSurfaceCount + clearColorSize + commandBufferCount, cmdBuffer->surfaces.size());
}
HWTEST_F(EnqueueKernelTest, givenReducedAddressSpaceGraphicsAllocationForHostPtrWithL3FlushRequiredWhenEnqueueKernelIsCalledThenFlushIsCalledForReducedAddressSpacePlatforms) {
@@ -940,6 +941,8 @@ HWTEST_F(EnqueueKernelTest, givenCommandStreamReceiverInBatchingModeWhenKernelIs
auto mockedSubmissionsAggregator = new MockSubmissionsAggregator();
mockCsrmockCsr.submissionAggregator.reset(mockedSubmissionsAggregator);
pDevice->getGpgpuCommandStreamReceiver().flushTagUpdate(); // to clear residency allocations after preallocations
MockKernelWithInternals mockKernel(*pClDevice, context);
size_t gws[3] = {1, 0, 0};
// make sure csr emits something

View File

@@ -9,6 +9,7 @@
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/memory_manager/allocations_list.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/surface.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/device_factory.h"
@@ -862,6 +863,10 @@ TEST_F(EnqueueSvmTest, GivenRepeatCallsWhenFillingMemoryThenSuccessIsReturnedFor
TEST_F(EnqueueSvmTest, givenEnqueueSVMMemFillWhenPatternAllocationIsObtainedThenItsTypeShouldBeSetToFillPattern) {
auto &csr = pCmdQ->getGpgpuCommandStreamReceiver();
if (pDevice->getProductHelper().getCommandBuffersPreallocatedPerCommandQueue() > 0) {
csr.flushTagUpdate();
csr.getInternalAllocationStorage()->cleanAllocationList(-1, AllocationUsage::REUSABLE_ALLOCATION);
}
ASSERT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
const float pattern[1] = {1.2345f};

View File

@@ -77,9 +77,10 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCsrInBatchingModeWhenFlushTas
csrSurfaceCount -= pDevice->getHardwareInfo().capabilityTable.supportsImages ? 0 : 1;
csrSurfaceCount += mockCsr->globalFenceAllocation ? 1 : 0;
csrSurfaceCount += mockCsr->clearColorAllocation ? 1 : 0;
csrSurfaceCount += pDevice->getProductHelper().getCommandBuffersPreallocatedPerCommandQueue() > 0 ? 0 : 1;
// we should have 3 heaps, tag allocation and csr command stream + cq
EXPECT_EQ(5u + csrSurfaceCount, cmdBuffer->surfaces.size());
EXPECT_EQ(4u + csrSurfaceCount, cmdBuffer->surfaces.size());
EXPECT_EQ(0, mockCsr->flushCalledCount);

View File

@@ -30,6 +30,7 @@ typedef Test<MemoryManagementFixture> ContextFailureInjection;
TEST_F(ContextFailureInjection, GivenFailedAllocationInjectionWhenCreatingContextThenOutOfHostMemoryErrorIsReturned) {
DebugManagerStateRestore restorer;
DebugManager.flags.ExperimentalSmallBufferPoolAllocator.set(0); // failing to allocate pool buffer is non-critical
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0); // same for preallocations
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
cl_device_id deviceID = device.get();

View File

@@ -190,6 +190,8 @@ TEST(CommandTest, givenWaitlistRequestWhenCommandComputeKernelIsCreatedThenMakeL
}
TEST(KernelOperationDestruction, givenKernelOperationWhenItIsDestructedThenAllAllocationsAreStoredInInternalStorageForReuse) {
DebugManagerStateRestore restorer;
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
MockCommandQueue cmdQ(nullptr, device.get(), nullptr, false);
InternalAllocationStorage &allocationStorage = *device->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage();

View File

@@ -225,6 +225,7 @@ struct PerformanceCountersMetricsLibraryTest : public PerformanceCountersMetrics
void TearDown() override {
    // Tear the shared fixture down first; the CSR context is set up only afterwards.
    PerformanceCountersMetricsLibraryFixture::tearDown();

    // Point the queue's GPGPU command stream receiver at the OS context of the
    // device's default engine.
    // NOTE(review): presumably this undoes a context swap done earlier in the
    // fixture (see the `osContext` member) — confirm against SetUp.
    auto &gpgpuCsr = queue->getGpgpuCommandStreamReceiver();
    auto &defaultEngineOsContext = *device->getDefaultEngine().osContext;
    gpgpuCsr.setupContext(defaultEngineOsContext);
}
std::unique_ptr<OsContext> osContext;
};

View File

@@ -266,8 +266,10 @@ void CommandStreamReceiver::preallocateCommandBuffer() {
// Requests one COMMAND_BUFFER allocation and, when the request succeeds, stores it
// as a reusable allocation and marks it resident.
// MemoryConstants::pageSize64k is passed as the third property argument
// (presumably the allocation size — confirm against AllocationProperties).
const AllocationProperties commandStreamAllocationProperties{rootDeviceIndex, true, MemoryConstants::pageSize64k, AllocationType::COMMAND_BUFFER,
isMultiOsContextCapable(), false, deviceBitfield};
auto allocation = this->getMemoryManager()->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
// A null result is ignored: nothing is stored and nothing is made resident.
if (allocation) {
// Ownership is handed to the internal allocation storage under REUSABLE_ALLOCATION.
// NOTE(review): `allocation` is dereferenced below after being wrapped in the
// unique_ptr given to storeAllocation — confirm the storage always keeps it alive.
getInternalAllocationStorage()->storeAllocation(std::unique_ptr<GraphicsAllocation>(allocation), REUSABLE_ALLOCATION);
this->makeResident(*allocation);
}
}
void CommandStreamReceiver::fillReusableAllocationsList() {

View File

@@ -91,4 +91,9 @@ bool ProductHelperHw<gfxProduct>::isPlatformDp4aSupported() const {
return true;
}
// Product-specific override: number of command buffers to preallocate for each
// command queue on this product.
template <>
uint32_t ProductHelperHw<gfxProduct>::getCommandBuffersPreallocatedPerCommandQueue() const {
    constexpr uint32_t preallocatedCommandBuffersPerQueue = 2u;
    return preallocatedCommandBuffersPerQueue;
}
} // namespace NEO

View File

@@ -173,17 +173,30 @@ HWTEST_F(CommandStreamReceiverTest, givenFlagDisabledWhenCallFillReusableAllocat
EXPECT_EQ(0u, commandStreamReceiver->getResidencyAllocations().size());
}
HWTEST_F(CommandStreamReceiverTest, givenUnsetPreallocationsPerQueueWhenRequestPreallocationCalledThenDoNotAllocateCommandBuffer) {
HWTEST_F(CommandStreamReceiverTest, givenUnsetPreallocationsPerQueueWhenRequestPreallocationCalledThenPreallocateCommandBufferCorrectly) {
EXPECT_TRUE(commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
EXPECT_EQ(0u, commandStreamReceiver->getResidencyAllocations().size());
auto &productHelper = getHelper<ProductHelper>();
const auto expectedPreallocations = productHelper.getCommandBuffersPreallocatedPerCommandQueue();
commandStreamReceiver->requestPreallocation();
if (expectedPreallocations > 0) {
EXPECT_FALSE(commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
EXPECT_EQ(expectedPreallocations, commandStreamReceiver->getResidencyAllocations().size());
} else {
EXPECT_TRUE(commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
EXPECT_EQ(0u, commandStreamReceiver->getResidencyAllocations().size());
}
commandStreamReceiver->releasePreallocationRequest();
if (expectedPreallocations > 0) {
EXPECT_FALSE(commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
EXPECT_EQ(expectedPreallocations, commandStreamReceiver->getResidencyAllocations().size());
} else {
EXPECT_TRUE(commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
EXPECT_EQ(0u, commandStreamReceiver->getResidencyAllocations().size());
}
}
HWTEST_F(CommandStreamReceiverTest, givenPreallocationsPerQueueEqualZeroWhenRequestPreallocationCalledThenDoNotAllocateCommandBuffer) {
@@ -220,6 +233,28 @@ HWTEST_F(CommandStreamReceiverTest, givenPreallocationsPerQueueWhenRequestPreall
EXPECT_EQ(2u, commandStreamReceiver->getResidencyAllocations().size());
}
// Verifies that requestPreallocation() leaves the reuse list and residency list
// untouched when the memory manager cannot allocate, and that a later request
// with the original memory manager restored does preallocate one command buffer.
HWTEST_F(CommandStreamReceiverTest, givenPreallocationsPerQueueWhenRequestPreallocationCalledButAllocationFailedThenRequestIsIgnored) {
DebugManagerStateRestore restorer;
// Force exactly one preallocation per command queue, independent of the product default.
DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(1);
// Precondition: nothing stored for reuse and nothing resident yet.
EXPECT_TRUE(commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
EXPECT_EQ(0u, commandStreamReceiver->getResidencyAllocations().size());
// make allocation fail
ExecutionEnvironment &executionEnvironment = *pDevice->getExecutionEnvironment();
// release() detaches the current memory manager so it can be put back below.
auto memoryManagerBackup = executionEnvironment.memoryManager.release();
executionEnvironment.memoryManager.reset(new FailMemoryManager(executionEnvironment));
commandStreamReceiver->requestPreallocation();
// The failed request must leave both lists unchanged.
EXPECT_TRUE(commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
EXPECT_EQ(0u, commandStreamReceiver->getResidencyAllocations().size());
// make allocation succeed
// Restores the original manager in place of the FailMemoryManager.
executionEnvironment.memoryManager.reset(memoryManagerBackup);
commandStreamReceiver->requestPreallocation();
// With a working memory manager the single requested command buffer is stored and resident.
EXPECT_FALSE(commandStreamReceiver->getAllocationsForReuse().peekIsEmpty());
EXPECT_EQ(1u, commandStreamReceiver->getResidencyAllocations().size());
}
HWTEST_F(CommandStreamReceiverTest, whenRegisterClientThenIncrementClientNum) {
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
auto numClients = csr.getNumClients();