Copy command buffer into ring buffer

Resolves: NEO-7422

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
Author: Lukasz Jobczyk
Date: 2023-02-22 07:29:42 +00:00
Committed by: Compute-Runtime-Automation
Parent: 6e39b98094
Commit: 2f5be7a48d
40 changed files with 557 additions and 163 deletions

View File

@@ -65,7 +65,7 @@ CommandContainer::CommandContainer(uint32_t maxNumAggregatedIdds) : CommandConta
numIddsPerBlock = maxNumAggregatedIdds;
}
CommandContainer::ErrorCode CommandContainer::initialize(Device *device, AllocationsList *reusableAllocationList, bool requireHeaps) {
CommandContainer::ErrorCode CommandContainer::initialize(Device *device, AllocationsList *reusableAllocationList, bool requireHeaps, bool createSecondaryCmdBufferInHostMem) {
this->device = device;
this->reusableAllocationList = reusableAllocationList;
size_t alignedSize = alignUp<size_t>(this->getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
@@ -85,6 +85,19 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
commandStream->replaceGraphicsAllocation(cmdBufferAllocation);
if (createSecondaryCmdBufferInHostMem) {
this->useSecondaryCommandStream = true;
auto cmdBufferAllocationHost = this->obtainNextCommandBufferAllocation(true);
if (!cmdBufferAllocationHost) {
return ErrorCode::OUT_OF_DEVICE_MEMORY;
}
secondaryCommandStreamForImmediateCmdList = std::make_unique<LinearStream>(cmdBufferAllocationHost->getUnderlyingBuffer(),
alignedSize - cmdBufferReservedSize, this, gfxCoreHelper.getBatchBufferEndSize());
secondaryCommandStreamForImmediateCmdList->replaceGraphicsAllocation(cmdBufferAllocationHost);
cmdBufferAllocations.push_back(cmdBufferAllocationHost);
}
if (!getFlushTaskUsedForImmediate()) {
addToResidencyContainer(cmdBufferAllocation);
}
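
The hunk above adds an optional secondary command stream whose backing allocation is placed in host memory; the primary, device-local stream keeps working as before. A minimal standalone sketch of that dual-stream setup, using hypothetical stand-in types rather than the NEO classes:

// Illustrative sketch only (hypothetical types, not the NEO classes).
#include <cstddef>
#include <memory>
#include <vector>

struct Buffer {                       // stand-in for a command-buffer allocation
    std::vector<unsigned char> bytes;
    bool hostVisible = false;
};

struct Stream {                       // stand-in for LinearStream: a cursor over a buffer
    explicit Stream(Buffer *b) : buffer(b) {}
    Buffer *buffer;
    size_t used = 0;
};

struct Container {
    void initialize(size_t size, bool createSecondaryCmdBufferInHostMem) {
        primaryBuffer = makeBuffer(size, false);                 // device-local stream
        commandStream = std::make_unique<Stream>(primaryBuffer.get());
        if (createSecondaryCmdBufferInHostMem) {
            useSecondaryCommandStream = true;
            secondaryBuffer = makeBuffer(size, true);            // CPU-visible copy source
            secondaryCommandStream = std::make_unique<Stream>(secondaryBuffer.get());
        }
    }
    static std::unique_ptr<Buffer> makeBuffer(size_t size, bool hostVisible) {
        auto b = std::make_unique<Buffer>();
        b->bytes.resize(size);
        b->hostVisible = hostVisible;
        return b;
    }
    std::unique_ptr<Buffer> primaryBuffer, secondaryBuffer;
    std::unique_ptr<Stream> commandStream, secondaryCommandStream;
    bool useSecondaryCommandStream = false;
};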
@@ -139,6 +152,14 @@ void CommandContainer::addToResidencyContainer(GraphicsAllocation *alloc) {
this->residencyContainer.push_back(alloc);
}
bool CommandContainer::swapStreams() {
if (this->useSecondaryCommandStream) {
this->commandStream.swap(this->secondaryCommandStreamForImmediateCmdList);
return true;
}
return false;
}
void CommandContainer::removeDuplicatesFromResidencyContainer() {
std::sort(this->residencyContainer.begin(), this->residencyContainer.end());
this->residencyContainer.erase(std::unique(this->residencyContainer.begin(), this->residencyContainer.end()), this->residencyContainer.end());
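
swapStreams() exchanges the active stream with the host-memory one, which is just a unique_ptr swap. A short sketch of the same pattern with hypothetical names:

// Illustrative sketch only: swapping the pointers means callers holding "the command
// stream" transparently start recording into the host-memory buffer, with no copy of
// already recorded commands.
#include <memory>

struct Stream {};

struct Container {
    std::unique_ptr<Stream> commandStream = std::make_unique<Stream>();
    std::unique_ptr<Stream> secondaryCommandStream;   // only set when host-mem mode is on
    bool useSecondaryCommandStream = false;

    bool swapStreams() {
        if (useSecondaryCommandStream) {
            commandStream.swap(secondaryCommandStream); // O(1) pointer exchange
            return true;
        }
        return false;                                   // nothing to swap in the default mode
    }
};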
@@ -275,14 +296,18 @@ void CommandContainer::handleCmdBufferAllocations(size_t startIndex) {
}
GraphicsAllocation *CommandContainer::obtainNextCommandBufferAllocation() {
return this->obtainNextCommandBufferAllocation(false);
}
GraphicsAllocation *CommandContainer::obtainNextCommandBufferAllocation(bool forceHostMemory) {
forceHostMemory &= this->useSecondaryCommandStream;
GraphicsAllocation *cmdBufferAllocation = nullptr;
if (this->reusableAllocationList) {
size_t alignedSize = alignUp<size_t>(this->getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
cmdBufferAllocation = this->reusableAllocationList->detachAllocation(alignedSize, nullptr, nullptr, AllocationType::COMMAND_BUFFER).release();
cmdBufferAllocation = this->reusableAllocationList->detachAllocation(alignedSize, nullptr, forceHostMemory, nullptr, AllocationType::COMMAND_BUFFER).release();
}
if (!cmdBufferAllocation) {
cmdBufferAllocation = this->allocateCommandBuffer();
cmdBufferAllocation = this->allocateCommandBuffer(forceHostMemory);
}
return cmdBufferAllocation;
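
obtainNextCommandBufferAllocation(bool) keeps the existing reuse-then-allocate order, but the host-memory request now participates in the reuse lookup, so a device-local buffer is never handed back for the host-memory stream. A standalone sketch of that fallback chain (hypothetical free list, not NEO's AllocationsList):

// Illustrative sketch only.
#include <cstddef>
#include <list>
#include <memory>

struct Allocation {
    size_t size = 0;
    bool systemMemoryForced = false;
};

struct ReuseList {
    std::list<std::unique_ptr<Allocation>> entries;

    std::unique_ptr<Allocation> detach(size_t minSize, bool forceHostMemory) {
        for (auto it = entries.begin(); it != entries.end(); ++it) {
            if ((*it)->size >= minSize && (*it)->systemMemoryForced == forceHostMemory) {
                auto alloc = std::move(*it);   // recycled buffer of the matching kind
                entries.erase(it);
                return alloc;
            }
        }
        return nullptr;
    }
};

std::unique_ptr<Allocation> obtainCommandBuffer(ReuseList &list, size_t size, bool forceHostMemory) {
    if (auto reused = list.detach(size, forceHostMemory)) {
        return reused;
    }
    auto fresh = std::make_unique<Allocation>();   // fall back to a new allocation
    fresh->size = size;
    fresh->systemMemoryForced = forceHostMemory;
    return fresh;
}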
@@ -381,10 +406,15 @@ void CommandContainer::reserveSpaceForDispatch(HeapReserveArguments &sshReserveA
}
GraphicsAllocation *CommandContainer::reuseExistingCmdBuffer() {
return this->reuseExistingCmdBuffer(false);
}
GraphicsAllocation *CommandContainer::reuseExistingCmdBuffer(bool forceHostMemory) {
forceHostMemory &= this->useSecondaryCommandStream;
size_t alignedSize = alignUp<size_t>(this->getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
auto cmdBufferAllocation = this->immediateReusableAllocationList->detachAllocation(alignedSize, nullptr, this->immediateCmdListCsr, AllocationType::COMMAND_BUFFER).release();
auto cmdBufferAllocation = this->immediateReusableAllocationList->detachAllocation(alignedSize, nullptr, forceHostMemory, this->immediateCmdListCsr, AllocationType::COMMAND_BUFFER).release();
if (!cmdBufferAllocation) {
this->reusableAllocationList->detachAllocation(alignedSize, nullptr, this->immediateCmdListCsr, AllocationType::COMMAND_BUFFER).release();
this->reusableAllocationList->detachAllocation(alignedSize, nullptr, forceHostMemory, this->immediateCmdListCsr, AllocationType::COMMAND_BUFFER).release();
}
if (cmdBufferAllocation) {
@@ -409,6 +439,10 @@ void CommandContainer::setCmdBuffer(GraphicsAllocation *cmdBuffer) {
}
GraphicsAllocation *CommandContainer::allocateCommandBuffer() {
return this->allocateCommandBuffer(false);
}
GraphicsAllocation *CommandContainer::allocateCommandBuffer(bool forceHostMemory) {
size_t alignedSize = alignUp<size_t>(this->getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
AllocationProperties properties{device->getRootDeviceIndex(),
true /* allocateMemory*/,
@@ -417,6 +451,7 @@ GraphicsAllocation *CommandContainer::allocateCommandBuffer() {
(device->getNumGenericSubDevices() > 1u) /* multiOsContextCapable */,
false,
device->getDeviceBitfield()};
properties.flags.forceSystemMemory = forceHostMemory && this->useSecondaryCommandStream;
return device->getMemoryManager()->allocateGraphicsMemoryWithProperties(properties);
}
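
allocateCommandBuffer(bool) only honours the host-memory request when the container was initialized with the secondary stream, so ordinary command lists keep receiving device-local buffers. A small sketch of that guard with a hypothetical properties struct:

// Illustrative sketch only (hypothetical struct, not NEO's AllocationProperties).
#include <cstddef>

struct CmdBufferProperties {
    size_t size = 0;
    bool forceSystemMemory = false;   // ask the memory manager for CPU-visible pages
};

CmdBufferProperties makeCmdBufferProperties(size_t alignedSize,
                                            bool forceHostMemory,
                                            bool useSecondaryCommandStream) {
    CmdBufferProperties props{};
    props.size = alignedSize;
    // The flag is only propagated when the secondary host-memory stream is in use.
    props.forceSystemMemory = forceHostMemory && useSecondaryCommandStream;
    return props;
}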

View File

@@ -97,7 +97,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
void *getHeapSpaceAllowGrow(HeapType heapType, size_t size);
ErrorCode initialize(Device *device, AllocationsList *reusableAllocationList, bool requireHeaps);
ErrorCode initialize(Device *device, AllocationsList *reusableAllocationList, bool requireHeaps, bool createSecondaryCmdBufferInHostMem);
void prepareBindfulSsh();
@@ -111,6 +111,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
void handleCmdBufferAllocations(size_t startIndex);
GraphicsAllocation *obtainNextCommandBufferAllocation();
GraphicsAllocation *obtainNextCommandBufferAllocation(bool forceHostMemory);
bool swapStreams();
void reset();
@@ -139,7 +142,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
void reserveSpaceForDispatch(HeapReserveArguments &sshReserveArg, HeapReserveArguments &dshReserveArg, bool getDsh);
GraphicsAllocation *reuseExistingCmdBuffer();
GraphicsAllocation *reuseExistingCmdBuffer(bool forceHostMemory);
GraphicsAllocation *allocateCommandBuffer();
GraphicsAllocation *allocateCommandBuffer(bool forceHostMemory);
void setCmdBuffer(GraphicsAllocation *cmdBuffer);
void addCurrentCommandBufferToReusableAllocationList();
@@ -177,6 +182,8 @@ class CommandContainer : public NonCopyableOrMovableClass {
std::unique_ptr<HeapHelper> heapHelper;
std::unique_ptr<LinearStream> commandStream;
std::unique_ptr<LinearStream> secondaryCommandStreamForImmediateCmdList;
bool useSecondaryCommandStream = false;
uint64_t instructionHeapBaseAddress = 0u;
uint64_t indirectObjectHeapBaseAddress = 0u;

View File

@@ -328,6 +328,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionOverrideComputeSupport, -1, "Ove
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableCacheFlush, -1, "-1: driver default, 0: additional cache flush is present 1: disable dispatching cache flush commands")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionNewResourceTlbFlush, -1, "-1: driver default - flush when new resource is bound, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableMonitorFence, -1, "Disable dispatching monitor fence commands")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionFlatRingBuffer, -1, "-1: default, 0: disable, 1: enable, Copies task command buffer directly into ring, implemented for immediate command lists only")
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmissionController, -1, "Enable direct submission terminating after given timeout, -1: default, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerTimeout, -1, "Set direct submission controller timeout, -1: default 5000 us, >=0: timeout in us")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerDivisor, -1, "Set direct submission controller timeout divider, -1: default 1, >0: divider value")
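
DirectSubmissionFlatRingBuffer follows the tri-state convention spelled out in its description: -1 keeps the driver's own decision, 0 forces the copy off, 1 forces it on. A hypothetical helper showing how such an override is typically applied:

// Illustrative sketch only (hypothetical helper, not a NEO function).
bool applyTriStateOverride(bool driverDefault, int flagValue) {
    if (flagValue != -1) {
        return flagValue != 0;   // an explicit 0/1 wins over the heuristic
    }
    return driverDefault;        // -1 leaves the driver default untouched
}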

View File

@@ -113,6 +113,7 @@ class DirectSubmissionHw {
virtual uint64_t updateTagValue() = 0;
virtual void getTagAddressValue(TagData &tagData) = 0;
void unblockGpu();
bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);
void cpuCachelineFlush(void *ptr, size_t size);

View File

@@ -704,13 +704,22 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
relaxedOrderingReturnPtrCmdStream.replaceBuffer(relaxedOrderingReturnPtrCmds, RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>());
}
dispatchStartSection(commandStreamAddress);
auto copyCmdBuffer = this->copyCommandBufferIntoRing(batchBuffer);
if (copyCmdBuffer) {
auto cmdStreamTaskPtr = ptrOffset(batchBuffer.stream->getCpuBase(), batchBuffer.startOffset);
auto sizeToCopy = ptrDiff(returnCmd, cmdStreamTaskPtr);
auto ringPtr = ringCommandStream.getSpace(sizeToCopy);
memcpy(ringPtr, cmdStreamTaskPtr, sizeToCopy);
} else {
dispatchStartSection(commandStreamAddress);
}
uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition();
if (this->relaxedOrderingEnabled && batchBuffer.hasRelaxedOrderingDependencies) {
dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer);
} else {
} else if (!copyCmdBuffer) {
setReturnAddress(returnCmd, returnGpuPointer);
}
} else if (workloadMode == 1) {
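
This is the core of the change: when the batch buffer qualifies, its recorded commands are memcpy'd straight into the ring buffer instead of being reached through a BB_START jump, and no return address needs to be patched because execution simply falls through to the next ring commands. A simplified standalone sketch of the two dispatch modes (hypothetical types, not the DirectSubmissionHw internals):

// Illustrative sketch only.
#include <cstddef>
#include <cstring>
#include <vector>

struct Ring {
    std::vector<unsigned char> storage = std::vector<unsigned char>(1 << 20);
    size_t used = 0;
    void *getSpace(size_t size) {        // bump-allocate space in the ring stream
        void *ptr = storage.data() + used;
        used += size;
        return ptr;
    }
};

void dispatchTask(Ring &ring, const void *taskCmds, size_t taskSize, bool copyIntoRing) {
    if (copyIntoRing) {
        // Flat mode: inline the task's bytes directly into the ring stream.
        std::memcpy(ring.getSpace(taskSize), taskCmds, taskSize);
    } else {
        // Chained mode: emit a jump into the task buffer and patch a jump back afterwards
        // (placeholders for dispatchStartSection / setReturnAddress).
    }
}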
@@ -880,6 +889,21 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchTaskStoreSection(uint64_
memcpy_s(dst, RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>(), preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());
}
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::copyCommandBufferIntoRing(BatchBuffer &batchBuffer) {
auto ret = this->osContext.getNumSupportedDevices() == 1u &&
!batchBuffer.chainedBatchBuffer &&
batchBuffer.commandBufferAllocation &&
MemoryPoolHelper::isSystemMemoryPool(batchBuffer.commandBufferAllocation->getMemoryPool()) &&
!batchBuffer.hasRelaxedOrderingDependencies;
if (DebugManager.flags.DirectSubmissionFlatRingBuffer.get() != -1) {
ret &= !!DebugManager.flags.DirectSubmissionFlatRingBuffer.get();
}
return ret;
}
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
// for now workloads requiring cache coherency are not supported
@@ -894,6 +918,11 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies);
if (this->copyCommandBufferIntoRing(batchBuffer)) {
dispatchSize += (batchBuffer.stream->getUsed() - batchBuffer.startOffset) - 2 * getSizeStartSection();
}
size_t cycleSize = getSizeSwitchRingBufferSection();
size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd(relaxedOrderingSchedulerWillBeNeeded);
if (this->relaxedOrderingEnabled) {
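
The extra dispatch-size term reserves ring space for the inlined payload while subtracting two start-section-sized jumps that the chained path would otherwise dispatch, plausibly the jump into the task buffer and the jump back out of it. A hedged sketch of the arithmetic with made-up sizes:

// Illustrative arithmetic only; all sizes below are hypothetical.
#include <cstddef>

size_t flatRingDispatchSize(size_t baseDispatchSize, size_t streamUsed,
                            size_t startOffset, size_t startSectionSize) {
    return baseDispatchSize + (streamUsed - startOffset) - 2 * startSectionSize;
}
// e.g. baseDispatchSize = 256, streamUsed = 4096, startOffset = 0, startSectionSize = 24
// -> 256 + 4096 - 48 = 4304 bytes reserved in the ring for this submission.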

View File

@@ -21,6 +21,7 @@ struct ReusableAllocationRequirements {
uint32_t contextId;
uint32_t activeTileCount;
uint32_t tagOffset;
bool forceSystemMemoryFlag;
};
bool checkTagAddressReady(ReusableAllocationRequirements *requirements, NEO::GraphicsAllocation *gfxAllocation) {
@@ -42,6 +43,10 @@ AllocationsList::AllocationsList(AllocationUsage allocationUsage)
: allocationUsage(allocationUsage) {}
std::unique_ptr<GraphicsAllocation> AllocationsList::detachAllocation(size_t requiredMinimalSize, const void *requiredPtr, CommandStreamReceiver *commandStreamReceiver, AllocationType allocationType) {
return this->detachAllocation(requiredMinimalSize, requiredPtr, false, commandStreamReceiver, allocationType);
}
std::unique_ptr<GraphicsAllocation> AllocationsList::detachAllocation(size_t requiredMinimalSize, const void *requiredPtr, bool forceSystemMemoryFlag, CommandStreamReceiver *commandStreamReceiver, AllocationType allocationType) {
ReusableAllocationRequirements req;
req.requiredMinimalSize = requiredMinimalSize;
req.csrTagAddress = (commandStreamReceiver == nullptr) ? nullptr : commandStreamReceiver->getTagAddress();
@@ -50,6 +55,7 @@ std::unique_ptr<GraphicsAllocation> AllocationsList::detachAllocation(size_t req
req.requiredPtr = requiredPtr;
req.activeTileCount = (commandStreamReceiver == nullptr) ? 1u : commandStreamReceiver->getActivePartitions();
req.tagOffset = (commandStreamReceiver == nullptr) ? 0u : commandStreamReceiver->getPostSyncWriteOffset();
req.forceSystemMemoryFlag = forceSystemMemoryFlag;
GraphicsAllocation *a = nullptr;
GraphicsAllocation *retAlloc = processLocked<AllocationsList, &AllocationsList::detachAllocationImpl>(a, static_cast<void *>(&req));
return std::unique_ptr<GraphicsAllocation>(retAlloc);
@@ -60,7 +66,8 @@ GraphicsAllocation *AllocationsList::detachAllocationImpl(GraphicsAllocation *,
auto *curr = head;
while (curr != nullptr) {
if ((req->allocationType == curr->getAllocationType()) &&
(curr->getUnderlyingBufferSize() >= req->requiredMinimalSize)) {
(curr->getUnderlyingBufferSize() >= req->requiredMinimalSize) &&
(curr->storageInfo.systemMemoryForced == req->forceSystemMemoryFlag)) {
if (req->csrTagAddress == nullptr) {
return removeOneImpl(curr, nullptr);
}
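
The reuse-list match now also compares the forced-system-memory flag, so a host-memory command buffer is only recycled for a request that asks for host memory, and vice versa. A sketch of that predicate with hypothetical structs mirroring the idea:

// Illustrative sketch only (hypothetical structs, not NEO's types).
#include <cstddef>

struct StorageInfo {
    bool systemMemoryForced = false;
};

struct Allocation {
    int type = 0;
    size_t size = 0;
    StorageInfo storageInfo;
};

struct Requirements {
    int allocationType = 0;
    size_t requiredMinimalSize = 0;
    bool forceSystemMemoryFlag = false;
};

bool matches(const Allocation &alloc, const Requirements &req) {
    return alloc.type == req.allocationType &&
           alloc.size >= req.requiredMinimalSize &&
           alloc.storageInfo.systemMemoryForced == req.forceSystemMemoryFlag;
}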

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -21,6 +21,7 @@ class AllocationsList : public IDList<GraphicsAllocation, true, true> {
AllocationsList(AllocationUsage allocationUsage);
std::unique_ptr<GraphicsAllocation> detachAllocation(size_t requiredMinimalSize, const void *requiredPtr, CommandStreamReceiver *commandStreamReceiver, AllocationType allocationType);
std::unique_ptr<GraphicsAllocation> detachAllocation(size_t requiredMinimalSize, const void *requiredPtr, bool forceSystemMemoryFlag, CommandStreamReceiver *commandStreamReceiver, AllocationType allocationType);
void freeAllGraphicsAllocations(Device *neoDevice);
private:

View File

@@ -31,6 +31,7 @@ struct StorageInfo {
bool isLockable = false;
bool localOnlyRequired = false;
bool systemMemoryPlacement = true;
bool systemMemoryForced = false;
char resourceTag[AppResourceDefines::maxStrLen + 1] = "";
uint32_t getMemoryBanks() const { return static_cast<uint32_t>(memoryBanks.to_ulong()); }
uint32_t getTotalBanksCnt() const;

View File

@@ -530,6 +530,7 @@ bool MemoryManager::getAllocationData(AllocationData &allocationData, const Allo
allocationData.flags.isUSMHostAllocation = properties.flags.isUSMHostAllocation;
allocationData.storageInfo.systemMemoryPlacement = allocationData.flags.useSystemMemory;
allocationData.storageInfo.systemMemoryForced = properties.flags.forceSystemMemory;
return true;
}

View File

@@ -2003,6 +2003,7 @@ DrmAllocation *DrmMemoryManager::createAllocWithAlignment(const AllocationData &
bo.release();
allocation->isShareableHostMemory = true;
allocation->storageInfo = allocationData.storageInfo;
return allocation.release();
} else {
return createAllocWithAlignmentFromUserptr(allocationData, size, alignment, alignedSize, gpuAddress);