Prealloc cmd buffer for CSR only when being used

Related-To: NEO-7361

Currently, an additional command buffer is
preallocated for all CSRs, even for those that
won't be used by the application. This change
preallocates it only for CSRs that are actually used.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2022-11-03 15:25:30 +00:00
committed by Compute-Runtime-Automation
parent 3a58579bbe
commit a66e69abc9
18 changed files with 66 additions and 37 deletions

View File

@ -196,7 +196,7 @@ CommandQueue *CommandQueue::create(uint32_t productFamily, Device *device, NEO::
osContext.setUmdPowerHintValue(driverHandleImp->powerHint);
osContext.reInitializeContext();
}
osContext.ensureContextInitialized();
csr->initializeResources();
csr->initDirectSubmission();
csr->registerClient();
return commandQueue;

View File

@ -195,7 +195,7 @@ void CommandQueue::initializeGpgpuInternals() const {
}
}
gpgpuEngine->osContext->ensureContextInitialized();
gpgpuEngine->commandStreamReceiver->initializeResources();
gpgpuEngine->commandStreamReceiver->initDirectSubmission();
if (getCmdQueueProperties<cl_queue_properties>(propertiesVector.data(), CL_QUEUE_PROPERTIES) & static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) && !this->gpgpuEngine->commandStreamReceiver->isUpdateTagFromWaitEnabled()) {
@ -340,7 +340,7 @@ void CommandQueue::constructBcsEnginesForSplit() {
bcsEngines[i] = neoDevice.tryGetEngine(engineType, EngineUsage::Regular);
bcsEngineTypes.push_back(engineType);
if (bcsEngines[i]) {
bcsEngines[i]->osContext->ensureContextInitialized();
bcsEngines[i]->commandStreamReceiver->initializeResources();
bcsEngines[i]->commandStreamReceiver->initDirectSubmission();
}
}

View File

@ -365,18 +365,18 @@ HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueReadThenEnqueueB
memoryManager->returnFakeAllocation = true;
auto cmdQHw = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);
auto csr1 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
std::unique_ptr<OsContext> osContext1(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular},
PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
auto csr1 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
csr1->setupContext(*osContext1);
csr1->initializeTagAllocation();
EngineControl control1(csr1.get(), osContext1.get());
auto csr2 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
std::unique_ptr<OsContext> osContext2(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS3, EngineUsage::Regular},
PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
auto csr2 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
csr2->setupContext(*osContext2);
csr2->initializeTagAllocation();
EngineControl control2(csr2.get(), osContext2.get());
@ -415,18 +415,18 @@ HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueBlockingReadThen
memoryManager->returnFakeAllocation = true;
auto cmdQHw = static_cast<MockCommandQueueHw<FamilyType> *>(this->pCmdQ);
auto csr1 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
std::unique_ptr<OsContext> osContext1(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular},
PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
auto csr1 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
csr1->setupContext(*osContext1);
csr1->initializeTagAllocation();
EngineControl control1(csr1.get(), osContext1.get());
auto csr2 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
std::unique_ptr<OsContext> osContext2(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS3, EngineUsage::Regular},
PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
auto csr2 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
csr2->setupContext(*osContext2);
csr2->initializeTagAllocation();
EngineControl control2(csr2.get(), osContext2.get());
@ -464,18 +464,18 @@ HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueReadWithEventThe
memoryManager->returnFakeAllocation = true;
auto cmdQHw = static_cast<MockCommandQueueHw<FamilyType> *>(this->pCmdQ);
auto csr1 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
std::unique_ptr<OsContext> osContext1(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular},
PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
auto csr1 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
csr1->setupContext(*osContext1);
csr1->initializeTagAllocation();
EngineControl control1(csr1.get(), osContext1.get());
auto csr2 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
std::unique_ptr<OsContext> osContext2(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS3, EngineUsage::Regular},
PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
auto csr2 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
csr2->setupContext(*osContext2);
csr2->initializeTagAllocation();
EngineControl control2(csr2.get(), osContext2.get());

View File

@ -324,9 +324,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandStreamReceiverFlushTaskTests, givenCommandStr
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
configureCSRtoNonDirtyState<FamilyType>(false);
auto currlockCounter = commandStreamReceiver.recursiveLockCounter.load();
commandStreamReceiver.registerInstructionCacheFlush();
EXPECT_EQ(1u, commandStreamReceiver.recursiveLockCounter);
EXPECT_EQ(currlockCounter + 1, commandStreamReceiver.recursiveLockCounter);
flushTask(commandStreamReceiver);

View File

@ -1002,7 +1002,7 @@ HWTEST_F(BcsTests, givenBltSizeWithLeftoverWhenDispatchedThenProgramAllRequiredC
uint32_t newTaskCount = 19;
csr.taskCount = newTaskCount - 1;
uint32_t expectedResursiveLockCount = 0u;
uint32_t expectedResursiveLockCount = csr.resourcesInitialized ? 1u : 0u;
EXPECT_EQ(expectedResursiveLockCount, csr.recursiveLockCounter.load());
auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::HostPtrToBuffer,
csr, buffer->getGraphicsAllocation(pDevice->getRootDeviceIndex()), nullptr, hostPtr,
@ -1013,12 +1013,16 @@ HWTEST_F(BcsTests, givenBltSizeWithLeftoverWhenDispatchedThenProgramAllRequiredC
}
EXPECT_EQ(expectedResursiveLockCount, csr.recursiveLockCounter.load());
bool areResourcesInitialized = csr.resourcesInitialized;
flushBcsTask(&csr, blitProperties, true, *pDevice);
EXPECT_EQ(newTaskCount, csr.taskCount);
EXPECT_EQ(newTaskCount, csr.latestFlushedTaskCount);
EXPECT_EQ(newTaskCount, csr.latestSentTaskCount);
EXPECT_EQ(newTaskCount, csr.latestSentTaskCountValueDuringFlush);
expectedResursiveLockCount++;
if (areResourcesInitialized != csr.resourcesInitialized) {
expectedResursiveLockCount++;
}
EXPECT_EQ(expectedResursiveLockCount, csr.recursiveLockCounter.load());
HardwareParse hwParser;

View File

@ -1197,7 +1197,7 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCsrOccurs) {
virtualEvent->submitCommand(false);
uint32_t expectedLockCounter = pDevice->getDefaultEngine().commandStreamReceiver->getClearColorAllocation() ? 3u : 2u;
uint32_t expectedLockCounter = pDevice->getDefaultEngine().commandStreamReceiver->getClearColorAllocation() ? 4u : 3u;
EXPECT_EQ(expectedLockCounter, pDevice->getUltCommandStreamReceiver<FamilyType>().recursiveLockCounter);
}
@ -1212,10 +1212,10 @@ HWTEST_F(EventTest, givenVirtualEventWhenSubmitCommandEventNotReadyAndEventWitho
};
auto virtualEvent = makeReleaseable<MockEvent>(pCmdQ, CL_COMMAND_NDRANGE_KERNEL, CompletionStamp::notReady, CompletionStamp::notReady);
auto currLockCounter = pDevice->getUltCommandStreamReceiver<FamilyType>().recursiveLockCounter.load();
virtualEvent->submitCommand(false);
EXPECT_EQ(pDevice->getUltCommandStreamReceiver<FamilyType>().recursiveLockCounter, 1u);
EXPECT_EQ(pDevice->getUltCommandStreamReceiver<FamilyType>().recursiveLockCounter, currLockCounter + 1);
}
HWTEST_F(InternalsEventTest, GivenBufferWithoutZeroCopyWhenMappingOrUnmappingThenFlushPreviousTasksBeforeMappingOrUnmapping) {

View File

@ -238,13 +238,24 @@ void CommandStreamReceiver::fillReusableAllocationsList() {
auto amountToFill = HwHelper::get(peekHwInfo().platform.eRenderCoreFamily).getAmountOfAllocationsToFill();
for (auto i = 0u; i < amountToFill; i++) {
const AllocationProperties commandStreamAllocationProperties{rootDeviceIndex, true, MemoryConstants::pageSize64k, AllocationType::COMMAND_BUFFER,
isMultiOsContextCapable(), false, osContext->getDeviceBitfield()};
isMultiOsContextCapable(), false, deviceBitfield};
auto allocation = this->getMemoryManager()->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
getInternalAllocationStorage()->storeAllocation(std::unique_ptr<GraphicsAllocation>(allocation), REUSABLE_ALLOCATION);
this->makeResident(*allocation);
}
}
// One-time, on-demand initialization of this CSR's resources: ensures the OS context
// is initialized and fills the reusable allocations list, then sets resourcesInitialized
// so that later calls skip the work.
// NOTE(review): resourcesInitialized is declared `volatile bool`; volatile does not give
// inter-thread ordering guarantees in C++ - consider std::atomic<bool> for the unlocked check.
void CommandStreamReceiver::initializeResources() {
// First check is done without taking ownership, so already-initialized CSRs return quickly.
if (!resourcesInitialized) {
// presumably obtainUniqueOwnership() returns a lock held until `lock` goes out of scope - verify
auto lock = obtainUniqueOwnership();
// Second check under ownership: another caller may have completed initialization in between.
if (!resourcesInitialized) {
osContext->ensureContextInitialized();
this->fillReusableAllocationsList();
// Flag is set only after both initialization steps above have been called.
this->resourcesInitialized = true;
}
}
}
MemoryManager *CommandStreamReceiver::getMemoryManager() const {
DEBUG_BREAK_IF(!executionEnvironment.memoryManager);
return executionEnvironment.memoryManager.get();

View File

@ -214,7 +214,7 @@ class CommandStreamReceiver {
virtual void fillReusableAllocationsList();
virtual void setupContext(OsContext &osContext) { this->osContext = &osContext; }
OsContext &getOsContext() const { return *osContext; }
void initializeResources();
TagAllocatorBase *getEventTsAllocator();
TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize);
virtual TagAllocatorBase *getTimestampPacketAllocator() = 0;
@ -500,6 +500,7 @@ class CommandStreamReceiver {
bool useNotifyEnableForPostSync = false;
bool dcFlushSupport = false;
bool forceSkipResourceCleanupRequired = false;
volatile bool resourcesInitialized = false;
};
typedef CommandStreamReceiver *(*CommandStreamReceiverCreateFunc)(bool withAubDump,

View File

@ -1074,7 +1074,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropertiesCo
auto newTaskCount = taskCount + 1;
latestSentTaskCount = newTaskCount;
getOsContext().ensureContextInitialized();
this->initializeResources();
this->initDirectSubmission();
const auto &hwInfo = this->peekHwInfo();

View File

@ -349,10 +349,10 @@ bool Device::createEngine(uint32_t deviceCsrIndex, EngineTypeUsage engineTypeUsa
EngineDescriptor engineDescriptor(engineTypeUsage, getDeviceBitfield(), preemptionMode, false, createAsEngineInstanced);
auto osContext = executionEnvironment->memoryManager->createAndRegisterOsContext(commandStreamReceiver.get(), engineDescriptor);
if (osContext->isImmediateContextInitializationEnabled(isDefaultEngine)) {
osContext->ensureContextInitialized();
}
commandStreamReceiver->setupContext(*osContext);
if (osContext->isImmediateContextInitializationEnabled(isDefaultEngine)) {
commandStreamReceiver->initializeResources();
}
if (!commandStreamReceiver->initializeTagAllocation()) {
return false;
@ -383,7 +383,6 @@ bool Device::createEngine(uint32_t deviceCsrIndex, EngineTypeUsage engineTypeUsa
if (engineUsage == EngineUsage::Regular) {
addEngineToEngineGroup(engine);
}
commandStreamReceiver->fillReusableAllocationsList();
commandStreamReceivers.push_back(std::move(commandStreamReceiver));
return true;

View File

@ -62,8 +62,8 @@ void RootDevice::initializeRootCommandStreamReceiver() {
auto osContext = getMemoryManager()->createAndRegisterOsContext(rootCommandStreamReceiver.get(), engineDescriptor);
osContext->ensureContextInitialized();
rootCommandStreamReceiver->setupContext(*osContext);
rootCommandStreamReceiver->initializeResources();
rootCommandStreamReceiver->initializeTagAllocation();
rootCommandStreamReceiver->createGlobalFenceAllocation();
rootCommandStreamReceiver->createWorkPartitionAllocation(*this);

View File

@ -200,7 +200,7 @@ BlitOperationResult BlitHelper::blitMemoryToAllocationBanks(const Device &device
return BlitOperationResult::Unsupported;
}
bcsEngine->osContext->ensureContextInitialized();
bcsEngine->commandStreamReceiver->initializeResources();
bcsEngine->commandStreamReceiver->initDirectSubmission();
BlitPropertiesContainer blitPropertiesContainer;
blitPropertiesContainer.push_back(

View File

@ -28,7 +28,7 @@ DrmMemoryOperationsHandlerBind::~DrmMemoryOperationsHandlerBind() = default;
MemoryOperationsStatus DrmMemoryOperationsHandlerBind::makeResident(Device *device, ArrayRef<GraphicsAllocation *> gfxAllocations) {
auto &engines = device->getAllEngines();
for (const auto &engine : engines) {
engine.osContext->ensureContextInitialized();
engine.commandStreamReceiver->initializeResources();
this->makeResidentWithinOsContext(engine.osContext, gfxAllocations, false);
}
return MemoryOperationsStatus::SUCCESS;

View File

@ -117,6 +117,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::CommandStreamReceiver::profilingTimeStampAllocator;
using BaseClass::CommandStreamReceiver::requiredPrivateScratchSize;
using BaseClass::CommandStreamReceiver::requiredScratchSize;
using BaseClass::CommandStreamReceiver::resourcesInitialized;
using BaseClass::CommandStreamReceiver::samplerCacheFlushRequired;
using BaseClass::CommandStreamReceiver::scratchSpaceController;
using BaseClass::CommandStreamReceiver::stallingCommandsOnNextFlushRequired;

View File

@ -93,6 +93,10 @@ class TestedDrmCommandStreamReceiver : public DrmCommandStreamReceiver<GfxFamily
}
}
// Test override: only counts how many times it was invoked
// (fillReusableAllocationsListCalled); it does not call the base implementation.
void fillReusableAllocationsList() override {
fillReusableAllocationsListCalled++;
}
struct WaitUserFenceResult {
uint32_t called = 0u;
uint32_t waitValue = 0u;
@ -125,6 +129,7 @@ class TestedDrmCommandStreamReceiver : public DrmCommandStreamReceiver<GfxFamily
}
void *latestReadBackAddress = nullptr;
uint32_t fillReusableAllocationsListCalled = 0;
};
template <typename GfxFamily>

View File

@ -88,12 +88,15 @@ HWTEST_F(CommandStreamReceiverTest, WhenCreatingCsrThenDefaultValuesAreSet) {
EXPECT_FALSE(csr.isPreambleSent);
}
HWTEST_F(CommandStreamReceiverTest, WhenCreatingCsrThenCallFillReusableAllocationsList) {
auto engineType = pDevice->allEngines[0].getEngineType();
pDevice->createEngine(0, {engineType, EngineUsage::Regular});
auto csrIndex = pDevice->commandStreamReceivers.size() - 1;
auto csr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(pDevice->commandStreamReceivers[csrIndex].get());
EXPECT_EQ(1u, csr->fillReusableAllocationsListCalled);
// Verifies that initializeResources() triggers fillReusableAllocationsList() exactly once,
// even when initializeResources() is called a second time.
HWTEST_F(CommandStreamReceiverTest, WhenInitializeResourcesThenCallFillReusableAllocationsListOnce) {
auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
// Reset the call counter and the initialized flag so the first call below does the work.
ultCsr.fillReusableAllocationsListCalled = 0u;
ultCsr.resourcesInitialized = false;
// NOTE(review): assumes the fixture's commandStreamReceiver is the same object as ultCsr - confirm
commandStreamReceiver->initializeResources();
EXPECT_EQ(1u, pDevice->getUltCommandStreamReceiver<FamilyType>().fillReusableAllocationsListCalled);
// Second call must not fill the list again; the counter is expected to stay at 1.
commandStreamReceiver->initializeResources();
EXPECT_EQ(1u, pDevice->getUltCommandStreamReceiver<FamilyType>().fillReusableAllocationsListCalled);
}
HWTEST_F(CommandStreamReceiverTest, givenCsrWhenCallFillReusableAllocationsListThenAllocateCommandBufferAndMakeItResident) {

View File

@ -108,11 +108,11 @@ HWTEST_F(TbxCommandStreamTests, givenTbxCommandStreamReceiverWhenMakeResidentIsC
auto graphicsAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pCommandStreamReceiver->getRootDeviceIndex(), MemoryConstants::pageSize});
ASSERT_NE(nullptr, graphicsAllocation);
EXPECT_EQ(0u, tbxCsr->getResidencyAllocations().size());
auto currResidencyAllocationsSize = tbxCsr->getResidencyAllocations().size();
tbxCsr->makeResident(*graphicsAllocation);
EXPECT_EQ(1u, tbxCsr->getResidencyAllocations().size());
EXPECT_EQ(currResidencyAllocationsSize + 1, tbxCsr->getResidencyAllocations().size());
memoryManager->freeGraphicsMemory(graphicsAllocation);
}
@ -144,16 +144,15 @@ HWTEST_F(TbxCommandStreamTests, givenTbxCommandStreamReceiverWhenMakeResidentHas
auto graphicsAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pCommandStreamReceiver->getRootDeviceIndex(), MemoryConstants::pageSize});
ASSERT_NE(nullptr, graphicsAllocation);
EXPECT_EQ(0u, tbxCsr->getResidencyAllocations().size());
auto currResidencyAllocationsSize = tbxCsr->getResidencyAllocations().size();
tbxCsr->makeResident(*graphicsAllocation);
EXPECT_EQ(1u, tbxCsr->getResidencyAllocations().size());
EXPECT_EQ(currResidencyAllocationsSize + 1, tbxCsr->getResidencyAllocations().size());
tbxCsr->makeResident(*graphicsAllocation);
EXPECT_EQ(1u, tbxCsr->getResidencyAllocations().size());
EXPECT_EQ(currResidencyAllocationsSize + 1, tbxCsr->getResidencyAllocations().size());
memoryManager->freeGraphicsMemory(graphicsAllocation);
}

View File

@ -61,6 +61,7 @@ class WddmCommandStreamFixture {
void setUp() {
HardwareInfo *hwInfo = nullptr;
DebugManager.flags.CsrDispatchMode.set(static_cast<uint32_t>(DispatchMode::ImmediateDispatch));
DebugManager.flags.SetAmountOfReusableAllocations.set(0);
auto executionEnvironment = getExecutionEnvironmentImpl(hwInfo, 1);
memoryManager = new MockWddmMemoryManager(*executionEnvironment);
@ -109,6 +110,10 @@ struct MockWddmCsr : public WddmCommandStreamReceiver<GfxFamily> {
recordedCommandBuffer = std::unique_ptr<CommandBuffer>(new CommandBuffer(device));
}
// Test override: only increments fillReusableAllocationsListCalled so tests can
// check the call count; the parent implementation is not invoked.
void fillReusableAllocationsList() override {
fillReusableAllocationsListCalled++;
}
bool initDirectSubmission() override {
if (callParentInitDirectSubmission) {
return WddmCommandStreamReceiver<GfxFamily>::initDirectSubmission();
@ -134,6 +139,7 @@ struct MockWddmCsr : public WddmCommandStreamReceiver<GfxFamily> {
bool callParentInitDirectSubmission = true;
bool initBlitterDirectSubmission = false;
uint32_t fillReusableAllocationsListCalled = 0;
};
class WddmCommandStreamMockGdiTest : public ::testing::Test {