mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-31 20:13:04 +08:00
[26/n] Internal 4GB allocator.
- Change the way blocked commands are handled.
- Instead of allocating a CPU pointer and populating it with commands, create a real IndirectHeap that may later be submitted to the GPU.
- This removes many of the copy operations that used to happen at submit time.
- For device enqueue, this requires the DSH and SSH to be passed directly to the underlying commands; in that scenario the device queue buffers are not used.
Change-Id: I1124a8edbb46777ea7f7d3a5946f302e7fdf9665
This commit is contained in:
committed by
sys_ocldev
parent
100f559daa
commit
ffa9b097f5
@@ -239,35 +239,7 @@ IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType,
|
||||
}
|
||||
|
||||
if (!heapMemory) {
|
||||
size_t reservedSize = 0;
|
||||
auto finalHeapSize = defaultHeapSize;
|
||||
|
||||
minRequiredSize += reservedSize;
|
||||
|
||||
finalHeapSize = alignUp(std::max(finalHeapSize, minRequiredSize), MemoryConstants::pageSize);
|
||||
|
||||
heapMemory = memoryManager->obtainReusableAllocation(finalHeapSize).release();
|
||||
|
||||
if (!heapMemory) {
|
||||
heapMemory = memoryManager->allocateGraphicsMemory(finalHeapSize, MemoryConstants::pageSize);
|
||||
} else {
|
||||
finalHeapSize = std::max(heapMemory->getUnderlyingBufferSize(), finalHeapSize);
|
||||
}
|
||||
|
||||
heapMemory->setAllocationType(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM);
|
||||
|
||||
if (IndirectHeap::SURFACE_STATE == heapType) {
|
||||
DEBUG_BREAK_IF(minRequiredSize > maxSshSize);
|
||||
finalHeapSize = maxSshSize;
|
||||
}
|
||||
|
||||
if (heap) {
|
||||
heap->replaceBuffer(heapMemory->getUnderlyingBuffer(), finalHeapSize);
|
||||
heap->replaceGraphicsAllocation(heapMemory);
|
||||
} else {
|
||||
heap = new IndirectHeap(heapMemory);
|
||||
heap->overrideMaxSize(finalHeapSize);
|
||||
}
|
||||
allocateHeapMemory(heapType, minRequiredSize, heap);
|
||||
}
|
||||
|
||||
return *heap;
|
||||
@@ -650,4 +622,37 @@ bool CommandQueue::setupDebugSurface(Kernel *kernel) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Attaches a graphics allocation to `indirectHeap`, creating the heap object
// when the caller passes a null pointer.
//
// heapType        - kind of heap being set up; SURFACE_STATE gets a fixed size.
// minRequiredSize - smallest number of bytes the caller needs in the heap.
// indirectHeap    - in/out: an existing heap is re-pointed at the allocation,
//                   a null pointer is replaced with a newly created heap.
//
// NOTE(review): neither allocation path is null-checked before it is
// dereferenced below - confirm allocation failure is handled elsewhere.
void CommandQueue::allocateHeapMemory(IndirectHeap::Type heapType,
                                      size_t minRequiredSize, IndirectHeap *&indirectHeap) {
    const size_t reservedBytes = 0;
    minRequiredSize += reservedBytes;

    // Start from the default heap size, grow to the request if it is larger,
    // and round the result up to a whole number of pages.
    auto heapSize = defaultHeapSize;
    heapSize = alignUp(std::max(heapSize, minRequiredSize), MemoryConstants::pageSize);

    auto manager = device->getMemoryManager();

    // Try a recycled allocation first; create a new one only when none is available.
    auto allocation = manager->obtainReusableAllocation(heapSize).release();
    if (allocation) {
        // A recycled allocation may be bigger than requested - use all of it.
        heapSize = std::max(allocation->getUnderlyingBufferSize(), heapSize);
    } else {
        allocation = manager->allocateGraphicsMemory(heapSize, MemoryConstants::pageSize);
    }

    allocation->setAllocationType(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM);

    // The surface state heap always reports maxSshSize, whatever was computed above.
    if (heapType == IndirectHeap::SURFACE_STATE) {
        DEBUG_BREAK_IF(minRequiredSize > maxSshSize);
        heapSize = maxSshSize;
    }

    if (!indirectHeap) {
        indirectHeap = new IndirectHeap(allocation);
        indirectHeap->overrideMaxSize(heapSize);
        return;
    }

    indirectHeap->replaceBuffer(allocation->getUnderlyingBuffer(), heapSize);
    indirectHeap->replaceGraphicsAllocation(allocation);
}
|
||||
} // namespace OCLRT
|
||||
|
||||
@@ -336,6 +336,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
IndirectHeap &getIndirectHeap(IndirectHeap::Type heapType,
|
||||
size_t minRequiredSize = 0u);
|
||||
|
||||
void allocateHeapMemory(IndirectHeap::Type heapType,
|
||||
size_t minRequiredSize, IndirectHeap *&indirectHeap);
|
||||
|
||||
MOCKABLE_VIRTUAL void releaseIndirectHeap(IndirectHeap::Type heapType);
|
||||
|
||||
cl_command_queue_properties getCommandQueueProperties() const {
|
||||
|
||||
@@ -275,6 +275,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
|
||||
uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
|
||||
devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
|
||||
*devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
|
||||
multiDispatchInfo.begin()->getKernel(),
|
||||
(uint32_t)multiDispatchInfo.size(),
|
||||
taskCount,
|
||||
@@ -297,7 +298,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
*this,
|
||||
*devQueueHw,
|
||||
preemption,
|
||||
scheduler);
|
||||
scheduler,
|
||||
&getIndirectHeap(IndirectHeap::SURFACE_STATE),
|
||||
devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
|
||||
|
||||
scheduler.makeResident(commandStreamReceiver);
|
||||
|
||||
|
||||
@@ -119,13 +119,6 @@ inline cl_uint computeDimensions(const size_t workItems[3]) {
|
||||
return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;
|
||||
}
|
||||
|
||||
template <typename SizeAndAllocCalcT, typename... CalcArgsT>
|
||||
IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
|
||||
size_t alignment = MemoryConstants::pageSize;
|
||||
size_t size = calc(std::forward<CalcArgsT>(args)...);
|
||||
return new IndirectHeap(alignedMalloc(size, alignment), size);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
class GpgpuWalkerHelper {
|
||||
public:
|
||||
@@ -227,7 +220,9 @@ class GpgpuWalkerHelper {
|
||||
CommandQueue &commandQueue,
|
||||
DeviceQueueHw<GfxFamily> &devQueueHw,
|
||||
PreemptionMode preemptionMode,
|
||||
SchedulerKernel &scheduler);
|
||||
SchedulerKernel &scheduler,
|
||||
IndirectHeap *ssh,
|
||||
IndirectHeap *dsh);
|
||||
};
|
||||
|
||||
template <typename GfxFamily, uint32_t eventType>
|
||||
|
||||
@@ -458,20 +458,27 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
|
||||
using KCH = KernelCommandsHelper<GfxFamily>;
|
||||
commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize), MemoryConstants::pageSize);
|
||||
if (executionModelKernel) {
|
||||
uint32_t offsetDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
|
||||
uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
|
||||
|
||||
dsh = allocateIndirectHeap([&multiDispatchInfo, offsetDsh] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo) + KCH::getTotalSizeRequiredIOH(multiDispatchInfo) + offsetDsh; });
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE,
|
||||
commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
|
||||
dsh);
|
||||
|
||||
dsh->getSpace(colorCalcSize);
|
||||
ioh = dsh;
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
|
||||
KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*(multiDispatchInfo.begin()->getKernel())) +
|
||||
KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
|
||||
ssh);
|
||||
} else {
|
||||
dsh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo); });
|
||||
ioh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIOH(multiDispatchInfo); });
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
|
||||
}
|
||||
|
||||
ssh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredSSH(multiDispatchInfo); });
|
||||
using UniqueIH = std::unique_ptr<IndirectHeap>;
|
||||
*blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh), UniqueIH(ssh));
|
||||
*blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh), UniqueIH(ssh),
|
||||
*commandQueue.getDevice().getMemoryManager());
|
||||
if (executionModelKernel) {
|
||||
(*blockedCommandsData)->doNotFreeISH = true;
|
||||
}
|
||||
@@ -671,7 +678,9 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
|
||||
CommandQueue &commandQueue,
|
||||
DeviceQueueHw<GfxFamily> &devQueueHw,
|
||||
PreemptionMode preemptionMode,
|
||||
SchedulerKernel &scheduler) {
|
||||
SchedulerKernel &scheduler,
|
||||
IndirectHeap *ssh,
|
||||
IndirectHeap *dsh) {
|
||||
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
|
||||
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
|
||||
@@ -679,13 +688,9 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
|
||||
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
|
||||
|
||||
OCLRT::LinearStream *commandStream = nullptr;
|
||||
OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
|
||||
OCLRT::IndirectHeap *ioh = nullptr;
|
||||
|
||||
commandStream = &commandQueue.getCS(0);
|
||||
// note : below code assumes that caller to dispatchScheduler "preallocated" memory
|
||||
// required for execution model in below heap managers
|
||||
dsh = devQueueHw.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
|
||||
ssh = &commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE);
|
||||
|
||||
bool dcFlush = false;
|
||||
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
|
||||
|
||||
Reference in New Issue
Block a user