mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-31 12:11:31 +08:00
Copy command buffer into ring buffer
Resolves: NEO-7422
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
6e39b98094
commit
2f5be7a48d
@@ -65,7 +65,7 @@ CommandContainer::CommandContainer(uint32_t maxNumAggregatedIdds) : CommandConta
|
||||
numIddsPerBlock = maxNumAggregatedIdds;
|
||||
}
|
||||
|
||||
CommandContainer::ErrorCode CommandContainer::initialize(Device *device, AllocationsList *reusableAllocationList, bool requireHeaps) {
|
||||
CommandContainer::ErrorCode CommandContainer::initialize(Device *device, AllocationsList *reusableAllocationList, bool requireHeaps, bool createSecondaryCmdBufferInHostMem) {
|
||||
this->device = device;
|
||||
this->reusableAllocationList = reusableAllocationList;
|
||||
size_t alignedSize = alignUp<size_t>(this->getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
|
||||
@@ -85,6 +85,19 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
|
||||
|
||||
commandStream->replaceGraphicsAllocation(cmdBufferAllocation);
|
||||
|
||||
if (createSecondaryCmdBufferInHostMem) {
|
||||
this->useSecondaryCommandStream = true;
|
||||
|
||||
auto cmdBufferAllocationHost = this->obtainNextCommandBufferAllocation(true);
|
||||
if (!cmdBufferAllocationHost) {
|
||||
return ErrorCode::OUT_OF_DEVICE_MEMORY;
|
||||
}
|
||||
secondaryCommandStreamForImmediateCmdList = std::make_unique<LinearStream>(cmdBufferAllocationHost->getUnderlyingBuffer(),
|
||||
alignedSize - cmdBufferReservedSize, this, gfxCoreHelper.getBatchBufferEndSize());
|
||||
secondaryCommandStreamForImmediateCmdList->replaceGraphicsAllocation(cmdBufferAllocationHost);
|
||||
cmdBufferAllocations.push_back(cmdBufferAllocationHost);
|
||||
}
|
||||
|
||||
if (!getFlushTaskUsedForImmediate()) {
|
||||
addToResidencyContainer(cmdBufferAllocation);
|
||||
}
|
||||
@@ -139,6 +152,14 @@ void CommandContainer::addToResidencyContainer(GraphicsAllocation *alloc) {
|
||||
this->residencyContainer.push_back(alloc);
|
||||
}
|
||||
|
||||
bool CommandContainer::swapStreams() {
|
||||
if (this->useSecondaryCommandStream) {
|
||||
this->commandStream.swap(this->secondaryCommandStreamForImmediateCmdList);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void CommandContainer::removeDuplicatesFromResidencyContainer() {
|
||||
std::sort(this->residencyContainer.begin(), this->residencyContainer.end());
|
||||
this->residencyContainer.erase(std::unique(this->residencyContainer.begin(), this->residencyContainer.end()), this->residencyContainer.end());
|
||||
@@ -275,14 +296,18 @@ void CommandContainer::handleCmdBufferAllocations(size_t startIndex) {
|
||||
}
|
||||
|
||||
// Convenience overload: forwards to obtainNextCommandBufferAllocation(bool)
// with host memory not forced.
GraphicsAllocation *CommandContainer::obtainNextCommandBufferAllocation() {
    constexpr bool forceHostMemory = false;
    return this->obtainNextCommandBufferAllocation(forceHostMemory);
}
|
||||
|
||||
GraphicsAllocation *CommandContainer::obtainNextCommandBufferAllocation(bool forceHostMemory) {
|
||||
forceHostMemory &= this->useSecondaryCommandStream;
|
||||
GraphicsAllocation *cmdBufferAllocation = nullptr;
|
||||
if (this->reusableAllocationList) {
|
||||
size_t alignedSize = alignUp<size_t>(this->getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
|
||||
cmdBufferAllocation = this->reusableAllocationList->detachAllocation(alignedSize, nullptr, nullptr, AllocationType::COMMAND_BUFFER).release();
|
||||
cmdBufferAllocation = this->reusableAllocationList->detachAllocation(alignedSize, nullptr, forceHostMemory, nullptr, AllocationType::COMMAND_BUFFER).release();
|
||||
}
|
||||
if (!cmdBufferAllocation) {
|
||||
cmdBufferAllocation = this->allocateCommandBuffer();
|
||||
cmdBufferAllocation = this->allocateCommandBuffer(forceHostMemory);
|
||||
}
|
||||
|
||||
return cmdBufferAllocation;
|
||||
@@ -381,10 +406,15 @@ void CommandContainer::reserveSpaceForDispatch(HeapReserveArguments &sshReserveA
|
||||
}
|
||||
|
||||
// Convenience overload: forwards to reuseExistingCmdBuffer(bool)
// with host memory not forced.
GraphicsAllocation *CommandContainer::reuseExistingCmdBuffer() {
    constexpr bool forceHostMemory = false;
    return this->reuseExistingCmdBuffer(forceHostMemory);
}
|
||||
|
||||
GraphicsAllocation *CommandContainer::reuseExistingCmdBuffer(bool forceHostMemory) {
|
||||
forceHostMemory &= this->useSecondaryCommandStream;
|
||||
size_t alignedSize = alignUp<size_t>(this->getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
|
||||
auto cmdBufferAllocation = this->immediateReusableAllocationList->detachAllocation(alignedSize, nullptr, this->immediateCmdListCsr, AllocationType::COMMAND_BUFFER).release();
|
||||
auto cmdBufferAllocation = this->immediateReusableAllocationList->detachAllocation(alignedSize, nullptr, forceHostMemory, this->immediateCmdListCsr, AllocationType::COMMAND_BUFFER).release();
|
||||
if (!cmdBufferAllocation) {
|
||||
this->reusableAllocationList->detachAllocation(alignedSize, nullptr, this->immediateCmdListCsr, AllocationType::COMMAND_BUFFER).release();
|
||||
this->reusableAllocationList->detachAllocation(alignedSize, nullptr, forceHostMemory, this->immediateCmdListCsr, AllocationType::COMMAND_BUFFER).release();
|
||||
}
|
||||
|
||||
if (cmdBufferAllocation) {
|
||||
@@ -409,6 +439,10 @@ void CommandContainer::setCmdBuffer(GraphicsAllocation *cmdBuffer) {
|
||||
}
|
||||
|
||||
// Convenience overload: forwards to allocateCommandBuffer(bool)
// with host memory not forced.
GraphicsAllocation *CommandContainer::allocateCommandBuffer() {
    constexpr bool forceHostMemory = false;
    return this->allocateCommandBuffer(forceHostMemory);
}
|
||||
|
||||
GraphicsAllocation *CommandContainer::allocateCommandBuffer(bool forceHostMemory) {
|
||||
size_t alignedSize = alignUp<size_t>(this->getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
|
||||
AllocationProperties properties{device->getRootDeviceIndex(),
|
||||
true /* allocateMemory*/,
|
||||
@@ -417,6 +451,7 @@ GraphicsAllocation *CommandContainer::allocateCommandBuffer() {
|
||||
(device->getNumGenericSubDevices() > 1u) /* multiOsContextCapable */,
|
||||
false,
|
||||
device->getDeviceBitfield()};
|
||||
properties.flags.forceSystemMemory = forceHostMemory && this->useSecondaryCommandStream;
|
||||
|
||||
return device->getMemoryManager()->allocateGraphicsMemoryWithProperties(properties);
|
||||
}
|
||||
|
||||
@@ -97,7 +97,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
||||
|
||||
void *getHeapSpaceAllowGrow(HeapType heapType, size_t size);
|
||||
|
||||
ErrorCode initialize(Device *device, AllocationsList *reusableAllocationList, bool requireHeaps);
|
||||
ErrorCode initialize(Device *device, AllocationsList *reusableAllocationList, bool requireHeaps, bool createSecondaryCmdBufferInHostMem);
|
||||
|
||||
void prepareBindfulSsh();
|
||||
|
||||
@@ -111,6 +111,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
||||
|
||||
void handleCmdBufferAllocations(size_t startIndex);
|
||||
GraphicsAllocation *obtainNextCommandBufferAllocation();
|
||||
GraphicsAllocation *obtainNextCommandBufferAllocation(bool forceHostMemory);
|
||||
|
||||
bool swapStreams();
|
||||
|
||||
void reset();
|
||||
|
||||
@@ -139,7 +142,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
||||
void reserveSpaceForDispatch(HeapReserveArguments &sshReserveArg, HeapReserveArguments &dshReserveArg, bool getDsh);
|
||||
|
||||
GraphicsAllocation *reuseExistingCmdBuffer();
|
||||
GraphicsAllocation *reuseExistingCmdBuffer(bool forceHostMemory);
|
||||
GraphicsAllocation *allocateCommandBuffer();
|
||||
GraphicsAllocation *allocateCommandBuffer(bool forceHostMemory);
|
||||
void setCmdBuffer(GraphicsAllocation *cmdBuffer);
|
||||
void addCurrentCommandBufferToReusableAllocationList();
|
||||
|
||||
@@ -177,6 +182,8 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
||||
|
||||
std::unique_ptr<HeapHelper> heapHelper;
|
||||
std::unique_ptr<LinearStream> commandStream;
|
||||
std::unique_ptr<LinearStream> secondaryCommandStreamForImmediateCmdList;
|
||||
bool useSecondaryCommandStream = false;
|
||||
|
||||
uint64_t instructionHeapBaseAddress = 0u;
|
||||
uint64_t indirectObjectHeapBaseAddress = 0u;
|
||||
|
||||
@@ -328,6 +328,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionOverrideComputeSupport, -1, "Ove
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableCacheFlush, -1, "-1: driver default, 0: additional cache flush is present 1: disable dispatching cache flush commands")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionNewResourceTlbFlush, -1, "-1: driver default - flush when new resource is bound, 0: disabled, 1: enabled")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableMonitorFence, -1, "Disable dispatching monitor fence commands")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionFlatRingBuffer, 0, "-1: default, 0: disable, 1: enable, Copies task command buffer directly into ring, implemented for immediate command lists only")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmissionController, -1, "Enable direct submission terminating after given timeout, -1: default, 0: disabled, 1: enabled")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerTimeout, -1, "Set direct submission controller timeout, -1: default 5000 us, >=0: timeout in us")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerDivisor, -1, "Set direct submission controller timeout divider, -1: default 1, >0: divider value")
|
||||
|
||||
@@ -113,6 +113,7 @@ class DirectSubmissionHw {
|
||||
virtual uint64_t updateTagValue() = 0;
|
||||
virtual void getTagAddressValue(TagData &tagData) = 0;
|
||||
void unblockGpu();
|
||||
bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);
|
||||
|
||||
void cpuCachelineFlush(void *ptr, size_t size);
|
||||
|
||||
|
||||
@@ -704,13 +704,22 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
|
||||
relaxedOrderingReturnPtrCmdStream.replaceBuffer(relaxedOrderingReturnPtrCmds, RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>());
|
||||
}
|
||||
|
||||
dispatchStartSection(commandStreamAddress);
|
||||
auto copyCmdBuffer = this->copyCommandBufferIntoRing(batchBuffer);
|
||||
|
||||
if (copyCmdBuffer) {
|
||||
auto cmdStreamTaskPtr = ptrOffset(batchBuffer.stream->getCpuBase(), batchBuffer.startOffset);
|
||||
auto sizeToCopy = ptrDiff(returnCmd, cmdStreamTaskPtr);
|
||||
auto ringPtr = ringCommandStream.getSpace(sizeToCopy);
|
||||
memcpy(ringPtr, cmdStreamTaskPtr, sizeToCopy);
|
||||
} else {
|
||||
dispatchStartSection(commandStreamAddress);
|
||||
}
|
||||
|
||||
uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition();
|
||||
|
||||
if (this->relaxedOrderingEnabled && batchBuffer.hasRelaxedOrderingDependencies) {
|
||||
dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer);
|
||||
} else {
|
||||
} else if (!copyCmdBuffer) {
|
||||
setReturnAddress(returnCmd, returnGpuPointer);
|
||||
}
|
||||
} else if (workloadMode == 1) {
|
||||
@@ -880,6 +889,21 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchTaskStoreSection(uint64_
|
||||
memcpy_s(dst, RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>(), preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
bool DirectSubmissionHw<GfxFamily, Dispatcher>::copyCommandBufferIntoRing(BatchBuffer &batchBuffer) {
|
||||
auto ret = this->osContext.getNumSupportedDevices() == 1u &&
|
||||
!batchBuffer.chainedBatchBuffer &&
|
||||
batchBuffer.commandBufferAllocation &&
|
||||
MemoryPoolHelper::isSystemMemoryPool(batchBuffer.commandBufferAllocation->getMemoryPool()) &&
|
||||
!batchBuffer.hasRelaxedOrderingDependencies;
|
||||
|
||||
if (DebugManager.flags.DirectSubmissionFlatRingBuffer.get() != -1) {
|
||||
ret &= !!DebugManager.flags.DirectSubmissionFlatRingBuffer.get();
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
|
||||
// for now workloads requiring cache coherency are not supported
|
||||
@@ -894,6 +918,11 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
|
||||
bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
|
||||
|
||||
size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies);
|
||||
|
||||
if (this->copyCommandBufferIntoRing(batchBuffer)) {
|
||||
dispatchSize += (batchBuffer.stream->getUsed() - batchBuffer.startOffset) - 2 * getSizeStartSection();
|
||||
}
|
||||
|
||||
size_t cycleSize = getSizeSwitchRingBufferSection();
|
||||
size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd(relaxedOrderingSchedulerWillBeNeeded);
|
||||
if (this->relaxedOrderingEnabled) {
|
||||
|
||||
@@ -21,6 +21,7 @@ struct ReusableAllocationRequirements {
|
||||
uint32_t contextId;
|
||||
uint32_t activeTileCount;
|
||||
uint32_t tagOffset;
|
||||
bool forceSystemMemoryFlag;
|
||||
};
|
||||
|
||||
bool checkTagAddressReady(ReusableAllocationRequirements *requirements, NEO::GraphicsAllocation *gfxAllocation) {
|
||||
@@ -42,6 +43,10 @@ AllocationsList::AllocationsList(AllocationUsage allocationUsage)
|
||||
: allocationUsage(allocationUsage) {}
|
||||
|
||||
std::unique_ptr<GraphicsAllocation> AllocationsList::detachAllocation(size_t requiredMinimalSize, const void *requiredPtr, CommandStreamReceiver *commandStreamReceiver, AllocationType allocationType) {
|
||||
return this->detachAllocation(requiredMinimalSize, requiredPtr, false, commandStreamReceiver, allocationType);
|
||||
}
|
||||
|
||||
std::unique_ptr<GraphicsAllocation> AllocationsList::detachAllocation(size_t requiredMinimalSize, const void *requiredPtr, bool forceSystemMemoryFlag, CommandStreamReceiver *commandStreamReceiver, AllocationType allocationType) {
|
||||
ReusableAllocationRequirements req;
|
||||
req.requiredMinimalSize = requiredMinimalSize;
|
||||
req.csrTagAddress = (commandStreamReceiver == nullptr) ? nullptr : commandStreamReceiver->getTagAddress();
|
||||
@@ -50,6 +55,7 @@ std::unique_ptr<GraphicsAllocation> AllocationsList::detachAllocation(size_t req
|
||||
req.requiredPtr = requiredPtr;
|
||||
req.activeTileCount = (commandStreamReceiver == nullptr) ? 1u : commandStreamReceiver->getActivePartitions();
|
||||
req.tagOffset = (commandStreamReceiver == nullptr) ? 0u : commandStreamReceiver->getPostSyncWriteOffset();
|
||||
req.forceSystemMemoryFlag = forceSystemMemoryFlag;
|
||||
GraphicsAllocation *a = nullptr;
|
||||
GraphicsAllocation *retAlloc = processLocked<AllocationsList, &AllocationsList::detachAllocationImpl>(a, static_cast<void *>(&req));
|
||||
return std::unique_ptr<GraphicsAllocation>(retAlloc);
|
||||
@@ -60,7 +66,8 @@ GraphicsAllocation *AllocationsList::detachAllocationImpl(GraphicsAllocation *,
|
||||
auto *curr = head;
|
||||
while (curr != nullptr) {
|
||||
if ((req->allocationType == curr->getAllocationType()) &&
|
||||
(curr->getUnderlyingBufferSize() >= req->requiredMinimalSize)) {
|
||||
(curr->getUnderlyingBufferSize() >= req->requiredMinimalSize) &&
|
||||
(curr->storageInfo.systemMemoryForced == req->forceSystemMemoryFlag)) {
|
||||
if (req->csrTagAddress == nullptr) {
|
||||
return removeOneImpl(curr, nullptr);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
* Copyright (C) 2018-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -21,6 +21,7 @@ class AllocationsList : public IDList<GraphicsAllocation, true, true> {
|
||||
AllocationsList(AllocationUsage allocationUsage);
|
||||
|
||||
std::unique_ptr<GraphicsAllocation> detachAllocation(size_t requiredMinimalSize, const void *requiredPtr, CommandStreamReceiver *commandStreamReceiver, AllocationType allocationType);
|
||||
std::unique_ptr<GraphicsAllocation> detachAllocation(size_t requiredMinimalSize, const void *requiredPtr, bool forceSystemMemoryFlag, CommandStreamReceiver *commandStreamReceiver, AllocationType allocationType);
|
||||
void freeAllGraphicsAllocations(Device *neoDevice);
|
||||
|
||||
private:
|
||||
|
||||
@@ -31,6 +31,7 @@ struct StorageInfo {
|
||||
bool isLockable = false;
|
||||
bool localOnlyRequired = false;
|
||||
bool systemMemoryPlacement = true;
|
||||
bool systemMemoryForced = false;
|
||||
char resourceTag[AppResourceDefines::maxStrLen + 1] = "";
|
||||
uint32_t getMemoryBanks() const { return static_cast<uint32_t>(memoryBanks.to_ulong()); }
|
||||
uint32_t getTotalBanksCnt() const;
|
||||
|
||||
@@ -530,6 +530,7 @@ bool MemoryManager::getAllocationData(AllocationData &allocationData, const Allo
|
||||
allocationData.flags.isUSMHostAllocation = properties.flags.isUSMHostAllocation;
|
||||
|
||||
allocationData.storageInfo.systemMemoryPlacement = allocationData.flags.useSystemMemory;
|
||||
allocationData.storageInfo.systemMemoryForced = properties.flags.forceSystemMemory;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -2003,6 +2003,7 @@ DrmAllocation *DrmMemoryManager::createAllocWithAlignment(const AllocationData &
|
||||
|
||||
bo.release();
|
||||
allocation->isShareableHostMemory = true;
|
||||
allocation->storageInfo = allocationData.storageInfo;
|
||||
return allocation.release();
|
||||
} else {
|
||||
return createAllocWithAlignmentFromUserptr(allocationData, size, alignment, alignedSize, gpuAddress);
|
||||
|
||||
Reference in New Issue
Block a user