performance(ocl): cmd buffer prealloc per cmdqueue

Add a mechanism that preallocates command buffer allocations into the
command stream receiver's reusable allocations list for each initialized
command queue.

This should limit additional allocations during hot loops.

The feature is disabled by default; it needs to be enabled in subsequent
commits by overriding the product helper method.

Related-To: NEO-8152

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2023-10-27 15:54:45 +00:00
committed by Compute-Runtime-Automation
parent cfbf6219fe
commit 39cf653959
13 changed files with 120 additions and 6 deletions

View File

@@ -152,6 +152,9 @@ CommandQueue::~CommandQueue() {
if (NEO::Debugger::isDebugEnabled(isInternalUsage) && device->getDevice().getL0Debugger()) {
device->getDevice().getL0Debugger()->notifyCommandQueueDestroyed(&device->getDevice());
}
if (gpgpuEngine) {
gpgpuEngine->commandStreamReceiver->releasePreallocationRequest();
}
}
timestampPacketContainer.reset();
@@ -215,6 +218,7 @@ void CommandQueue::initializeGpgpuInternals() const {
}
gpgpuEngine->commandStreamReceiver->initializeResources();
gpgpuEngine->commandStreamReceiver->requestPreallocation();
gpgpuEngine->commandStreamReceiver->initDirectSubmission();
if (getCmdQueueProperties<cl_queue_properties>(propertiesVector.data(), CL_QUEUE_PROPERTIES) & static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) && !this->gpgpuEngine->commandStreamReceiver->isUpdateTagFromWaitEnabled()) {

View File

@@ -981,6 +981,30 @@ HWTEST_F(CommandQueueTests, givenNodeOrdinalSetWithCcsEngineWhenCreatingCommandQ
delete pCmdQ;
}
// Verifies that initializing the gpgpu engine of a command queue requests a
// per-queue preallocation from the CSR, and that destroying the queue releases
// the request while the preallocated command buffer stays in place.
HWTEST_F(CommandQueueTests, givenPreallocationsPerQueueWhenInitializeGpgpuCalledThenCSRRequestPreallocationIsCalled) {
    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
    MockContext context(clDevice.get());
    auto queue = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, clDevice.get(), nullptr);
    auto &csr = clDevice->getUltCommandStreamReceiver<FamilyType>();

    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(1);

    // Before initialization: nothing requested, nothing preallocated.
    EXPECT_EQ(0u, csr.requestedPreallocationsAmount);
    EXPECT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(0u, csr.getResidencyAllocations().size());

    queue->initializeGpgpu();

    // After initialization: one request registered and one buffer preallocated.
    EXPECT_EQ(1u, csr.requestedPreallocationsAmount);
    EXPECT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(1u, csr.getResidencyAllocations().size());

    queue.reset();

    // After queue destruction: the request is gone, the buffer is kept.
    EXPECT_EQ(0u, csr.requestedPreallocationsAmount);
    EXPECT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(1u, csr.getResidencyAllocations().size());
}
struct WaitForQueueCompletionTests : public ::testing::Test {
template <typename Family>
struct MyCmdQueue : public CommandQueueHw<Family> {

View File

@@ -1210,7 +1210,7 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCsrOccurs) {
virtualEvent->submitCommand(false);
uint32_t expectedLockCounter = pDevice->getDefaultEngine().commandStreamReceiver->getClearColorAllocation() ? 4u : 3u;
uint32_t expectedLockCounter = pDevice->getDefaultEngine().commandStreamReceiver->getClearColorAllocation() ? 5u : 4u;
EXPECT_EQ(expectedLockCounter, pDevice->getUltCommandStreamReceiver<FamilyType>().recursiveLockCounter);
}

View File

@@ -258,18 +258,48 @@ void CommandStreamReceiver::ensureCommandBufferAllocation(LinearStream &commandS
commandStream.replaceGraphicsAllocation(allocation);
}
/**
 * Allocates one command buffer of MemoryConstants::pageSize64k bytes, stores it
 * on this CSR's internal allocation storage as a REUSABLE_ALLOCATION and calls
 * makeResident() on it.
 *
 * Preallocation is only an optimization, so when the memory manager hands back
 * no allocation the call is a no-op instead of storing and dereferencing a
 * null pointer.
 */
void CommandStreamReceiver::preallocateCommandBuffer() {
    const AllocationProperties commandStreamAllocationProperties{rootDeviceIndex, true, MemoryConstants::pageSize64k, AllocationType::COMMAND_BUFFER,
                                                                 isMultiOsContextCapable(), false, deviceBitfield};
    auto allocation = this->getMemoryManager()->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
    if (allocation == nullptr) {
        // Nothing to store or make resident; command buffers can still be
        // allocated on demand later.
        return;
    }
    // Ownership moves to the internal storage; the raw pointer is used only
    // to register residency right after.
    getInternalAllocationStorage()->storeAllocation(std::unique_ptr<GraphicsAllocation>(allocation), REUSABLE_ALLOCATION);
    this->makeResident(*allocation);
}
/**
 * Fills the reusable allocations list with the number of command buffers
 * reported by GfxCoreHelper::getAmountOfAllocationsToFill().
 *
 * Each buffer is created through preallocateCommandBuffer() so that the
 * allocate / store / make-resident sequence lives in exactly one place and
 * every loop iteration produces exactly one allocation.
 */
void CommandStreamReceiver::fillReusableAllocationsList() {
    auto &gfxCoreHelper = getGfxCoreHelper();
    const auto amountToFill = gfxCoreHelper.getAmountOfAllocationsToFill();
    for (auto i = 0u; i < amountToFill; i++) {
        preallocateCommandBuffer();
    }
}
/**
 * Registers one command queue's preallocation request and tops up the pool of
 * preallocated command buffers when the total requested amount exceeds what
 * has been preallocated so far.
 *
 * The per-queue amount comes from the product helper unless the
 * SetAmountOfReusableAllocationsPerCmdQueue debug flag holds a value other
 * than -1, in which case the flag value is used.
 */
void CommandStreamReceiver::requestPreallocation() {
    auto perQueueAmount = getProductHelper().getCommandBuffersPreallocatedPerCommandQueue();
    const auto debugOverride = DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.get();
    if (debugOverride != -1) {
        perQueueAmount = debugOverride;
    }

    // Unique ownership is held while the counters are compared and buffers are added.
    auto ownership = obtainUniqueOwnership();
    requestedPreallocationsAmount += perQueueAmount;

    // Signed arithmetic: the requested total may be lower than the already
    // preallocated amount after earlier requests were released.
    const auto totalRequested = static_cast<int64_t>(requestedPreallocationsAmount.load());
    const int64_t missingAmount = totalRequested - preallocatedAmount;
    DEBUG_BREAK_IF(missingAmount > perQueueAmount);
    if (missingAmount <= 0) {
        return;
    }

    for (int64_t remaining = missingAmount; remaining > 0; --remaining) {
        preallocateCommandBuffer();
    }
    preallocatedAmount += static_cast<uint32_t>(missingAmount);
}
/**
 * Withdraws one command queue's preallocation request from the requested
 * total. Already preallocated command buffers are left untouched.
 *
 * The per-queue amount is resolved the same way as in requestPreallocation():
 * product helper value, overridden by the
 * SetAmountOfReusableAllocationsPerCmdQueue debug flag when it is not -1.
 *
 * The counter is unsigned, so the decrement saturates at zero. Releasing more
 * than is currently requested would otherwise wrap the counter around and make
 * the next requestPreallocation() see an enormous deficit.
 */
void CommandStreamReceiver::releasePreallocationRequest() {
    auto preallocationsPerQueue = getProductHelper().getCommandBuffersPreallocatedPerCommandQueue();
    if (DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.get() != -1) {
        preallocationsPerQueue = DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.get();
    }
    // Still report the invariant violation through the debug-break macro.
    DEBUG_BREAK_IF(preallocationsPerQueue > requestedPreallocationsAmount);

    // Lock-free saturating subtraction: retry until the counter is updated
    // from the exact value the new amount was computed from.
    auto currentlyRequested = requestedPreallocationsAmount.load();
    uint32_t remainingRequested = 0u;
    do {
        remainingRequested = currentlyRequested > preallocationsPerQueue ? currentlyRequested - preallocationsPerQueue : 0u;
    } while (!requestedPreallocationsAmount.compare_exchange_weak(currentlyRequested, remainingRequested));
}
bool CommandStreamReceiver::initializeResources() {
if (!resourcesInitialized) {
auto lock = obtainUniqueOwnership();

View File

@@ -432,6 +432,10 @@ class CommandStreamReceiver {
virtual bool waitUserFence(TaskCountType waitValue, uint64_t hostAddress, int64_t timeout) { return false; }
void requestPreallocation();
void releasePreallocationRequest();
void preallocateCommandBuffer();
protected:
void cleanupResources();
void printDeviceIndex();
@@ -448,6 +452,9 @@ class CommandStreamReceiver {
std::unique_ptr<FlatBatchBufferHelper> flatBatchBufferHelper;
std::unique_ptr<ExperimentalCommandBuffer> experimentalCmdBuffer;
std::unique_ptr<InternalAllocationStorage> internalAllocationStorage;
std::atomic<uint32_t> preallocatedAmount{0};
std::atomic<uint32_t> requestedPreallocationsAmount{0};
std::unique_ptr<KmdNotifyHelper> kmdNotifyHelper;
std::unique_ptr<ScratchSpaceController> scratchSpaceController;
std::unique_ptr<TagAllocatorBase> profilingTimeStampAllocator;

View File

@@ -345,6 +345,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskH2D, 0, "0: default, >0: bitmask: in
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskD2H, 0, "0: default, >0: bitmask: indicates bcs engines for D2H split")
DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.")
DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocations, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers and heaps at initialization of immediate command list.")
DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocationsPerCmdQueue, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers for each initialized opencl command queue.")
DECLARE_DEBUG_VARIABLE(int32_t, UseHighAlignmentForHeapExtended, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver aligns HEAP_EXTENDED allocations to GPU VA that is next power of 2 for a given size, if disables GPU VA is using 2MB/64KB alignment.")
DECLARE_DEBUG_VARIABLE(int32_t, DispatchCmdlistCmdBufferPrimary, -1, "-1: default, 0: dispatch command buffers as seconadry, 1: dispatch command buffers as primary and chain")
DECLARE_DEBUG_VARIABLE(int32_t, UseImmediateFlushTask, -1, "-1: default, 0: use regular flush task, 1: use immediate flush task")

View File

@@ -165,6 +165,7 @@ class ProductHelper {
virtual bool isLinearStoragePreferred(bool isImage1d, bool forceLinearStorage) const = 0;
virtual bool isTranslationExceptionSupported() const = 0;
virtual uint32_t getMaxNumSamplers() const = 0;
virtual uint32_t getCommandBuffersPreallocatedPerCommandQueue() const = 0;
virtual bool getFrontEndPropertyScratchSizeSupport() const = 0;
virtual bool getFrontEndPropertyPrivateScratchSizeSupport() const = 0;

View File

@@ -778,6 +778,11 @@ uint32_t ProductHelperHw<gfxProduct>::getMaxNumSamplers() const {
return 16u;
}
// Generic implementation: no command buffers are preallocated per command queue.
template <PRODUCT_FAMILY gfxProduct>
uint32_t ProductHelperHw<gfxProduct>::getCommandBuffersPreallocatedPerCommandQueue() const {
    constexpr uint32_t preallocatedCommandBuffers = 0u;
    return preallocatedCommandBuffers;
}
template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::disableL3CacheForDebug(const HardwareInfo &) const {
return false;

View File

@@ -119,6 +119,7 @@ class ProductHelperHw : public ProductHelper {
bool isLinearStoragePreferred(bool isImage1d, bool forceLinearStorage) const override;
bool isTranslationExceptionSupported() const override;
uint32_t getMaxNumSamplers() const override;
uint32_t getCommandBuffersPreallocatedPerCommandQueue() const override;
bool getFrontEndPropertyScratchSizeSupport() const override;
bool getFrontEndPropertyPrivateScratchSizeSupport() const override;

View File

@@ -135,6 +135,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::CommandStreamReceiver::perfCounterAllocator;
using BaseClass::CommandStreamReceiver::pipelineSupportFlags;
using BaseClass::CommandStreamReceiver::profilingTimeStampAllocator;
using BaseClass::CommandStreamReceiver::requestedPreallocationsAmount;
using BaseClass::CommandStreamReceiver::requiredPrivateScratchSize;
using BaseClass::CommandStreamReceiver::requiredScratchSize;
using BaseClass::CommandStreamReceiver::resourcesInitialized;

View File

@@ -367,6 +367,11 @@ uint32_t ProductHelperHw<IGFX_UNKNOWN>::getMaxNumSamplers() const {
return 0u;
}
// IGFX_UNKNOWN specialization: no command buffers are preallocated per command queue.
template <>
uint32_t ProductHelperHw<IGFX_UNKNOWN>::getCommandBuffersPreallocatedPerCommandQueue() const {
    constexpr uint32_t preallocatedCommandBuffers = 0u;
    return preallocatedCommandBuffers;
}
template <>
uint32_t L1CachePolicyHelper<IGFX_UNKNOWN>::getL1CachePolicy(bool isDebuggerActive) {
return L1CachePolicyHelper<IGFX_UNKNOWN>::getDefaultL1CachePolicy(isDebuggerActive);

View File

@@ -556,4 +556,5 @@ OverridePatIndexForDeviceMemory = -1
PrintGmmCompressionParams = 0
SkipInOrderNonWalkerSignalingAllowed = 0
PrintKernelDispatchParameters = 0
SetAmountOfReusableAllocationsPerCmdQueue = -1
# Please don't edit below this line

View File

@@ -173,6 +173,40 @@ HWTEST_F(CommandStreamReceiverTest, givenFlagDisabledWhenCallFillReusableAllocat
EXPECT_EQ(0u, commandStreamReceiver->getResidencyAllocations().size());
}
// With the per-queue amount forced to zero, requesting a preallocation must
// neither add a reusable allocation nor make anything resident.
HWTEST_F(CommandStreamReceiverTest, givenPreallocationsPerQueueEqualZeroWhenRequestPreallocationCalledThenDoNotAllocateCommandBuffer) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
    auto &csr = *commandStreamReceiver;

    // Initial state: empty reuse list, no resident allocations.
    EXPECT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(0u, csr.getResidencyAllocations().size());

    csr.requestPreallocation();

    // State is unchanged by the request.
    EXPECT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(0u, csr.getResidencyAllocations().size());
}
// With one preallocation per queue, a command buffer is allocated only when
// the requested total exceeds the amount that has already been preallocated.
HWTEST_F(CommandStreamReceiverTest, givenPreallocationsPerQueueWhenRequestPreallocationCalledThenAllocateCommandBufferIfNeeded) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(1);
    auto &csr = *commandStreamReceiver;

    // Initial state: empty reuse list, no resident allocations.
    EXPECT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(0u, csr.getResidencyAllocations().size());

    // First request: one buffer gets preallocated.
    csr.requestPreallocation();
    EXPECT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(1u, csr.getResidencyAllocations().size());

    // Releasing the request keeps the preallocated buffer.
    csr.releasePreallocationRequest();
    EXPECT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(1u, csr.getResidencyAllocations().size());

    // A new request is covered by the existing buffer, so nothing is added.
    csr.requestPreallocation();
    EXPECT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(1u, csr.getResidencyAllocations().size());

    // A second outstanding request exceeds the preallocated amount: one more buffer.
    csr.requestPreallocation();
    EXPECT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(2u, csr.getResidencyAllocations().size());
}
HWTEST_F(CommandStreamReceiverTest, whenRegisterClientThenIncrementClientNum) {
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
auto numClients = csr.getNumClients();