Add method to submit kernel on single subdevice

Signed-off-by: Jobczyk, Lukasz <lukasz.jobczyk@intel.com>
This commit is contained in:
Jobczyk, Lukasz
2020-11-18 13:56:18 +00:00
committed by Compute-Runtime-Automation
parent 00c92c8c14
commit 343fd602fa
23 changed files with 191 additions and 152 deletions

View File

@@ -146,6 +146,8 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
CsrSizeRequestFlags csrSizeRequestFlags = {};
bool wasSubmittedToSingleSubdevice = false;
std::unique_ptr<DirectSubmissionHw<GfxFamily, RenderDispatcher<GfxFamily>>> directSubmission;
std::unique_ptr<DirectSubmissionHw<GfxFamily, BlitterDispatcher<GfxFamily>>> blitterDirectSubmission;
};

View File

@@ -197,7 +197,11 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
void *currentPipeControlForNooping = nullptr;
void *epiloguePipeControlLocation = nullptr;
if (DebugManager.flags.ForceCsrFlushing.get()) {
bool csrFlush = this->wasSubmittedToSingleSubdevice != dispatchFlags.useSingleSubdevice;
csrFlush |= DebugManager.flags.ForceCsrFlushing.get();
if (csrFlush) {
flushBatchedSubmissions();
}
@@ -544,7 +548,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
auto &streamToSubmit = submitCommandStreamFromCsr ? commandStreamCSR : commandStreamTask;
BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, chainedBatchBuffer,
dispatchFlags.requiresCoherency, dispatchFlags.lowPriority, dispatchFlags.throttle, dispatchFlags.sliceCount,
streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation};
streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, dispatchFlags.useSingleSubdevice};
if (submitCSR | submitTask) {
if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
@@ -566,6 +570,8 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
this->makeSurfacePackNonResident(this->getResidencyAllocations());
}
this->wasSubmittedToSingleSubdevice = dispatchFlags.useSingleSubdevice;
//check if we are not over the budget, if we are do implicit flush
if (getMemoryManager()->isMemoryBudgetExhausted()) {
if (this->totalMemoryUsed >= device.getDeviceInfo().globalMemSize / 4) {
@@ -1027,7 +1033,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::blitBuffer(const BlitPropertiesCont
}
BatchBuffer batchBuffer{commandStream.getGraphicsAllocation(), commandStreamStart, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount,
commandStream.getUsed(), &commandStream, endingCmdPtr};
commandStream.getUsed(), &commandStream, endingCmdPtr, false};
flush(batchBuffer, getResidencyAllocations());
makeSurfacePackNonResident(getResidencyAllocations());

View File

@@ -55,29 +55,31 @@ struct DispatchFlags {
uint32_t l3CacheSettings, uint32_t threadArbitrationPolicy, uint32_t additionalKernelExecInfo, KernelExecutionType kernelExecutionType, uint64_t sliceCount, bool blocking, bool dcFlush,
bool useSLM, bool guardCommandBufferWithPipeControl, bool gsba32BitRequired,
bool requiresCoherency, bool lowPriority, bool implicitFlush,
bool outOfOrderExecutionAllowed, bool epilogueRequired, bool usePerDSSbackedBuffer) : csrDependencies(csrDependencies),
barrierTimestampPacketNodes(barrierTimestampPacketNodes),
pipelineSelectArgs(pipelineSelectArgs),
flushStampReference(flushStampReference),
throttle(throttle),
preemptionMode(preemptionMode),
numGrfRequired(numGrfRequired),
l3CacheSettings(l3CacheSettings),
threadArbitrationPolicy(threadArbitrationPolicy),
additionalKernelExecInfo(additionalKernelExecInfo),
kernelExecutionType(kernelExecutionType),
sliceCount(sliceCount),
blocking(blocking),
dcFlush(dcFlush),
useSLM(useSLM),
guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControl),
gsba32BitRequired(gsba32BitRequired),
requiresCoherency(requiresCoherency),
lowPriority(lowPriority),
implicitFlush(implicitFlush),
outOfOrderExecutionAllowed(outOfOrderExecutionAllowed),
epilogueRequired(epilogueRequired),
usePerDssBackedBuffer(usePerDSSbackedBuffer){};
bool outOfOrderExecutionAllowed, bool epilogueRequired, bool usePerDSSbackedBuffer, bool useSingleSubdevice) : csrDependencies(csrDependencies),
barrierTimestampPacketNodes(barrierTimestampPacketNodes),
pipelineSelectArgs(pipelineSelectArgs),
flushStampReference(flushStampReference),
throttle(throttle),
preemptionMode(preemptionMode),
numGrfRequired(numGrfRequired),
l3CacheSettings(l3CacheSettings),
threadArbitrationPolicy(threadArbitrationPolicy),
additionalKernelExecInfo(additionalKernelExecInfo),
kernelExecutionType(kernelExecutionType),
sliceCount(sliceCount),
blocking(blocking),
dcFlush(dcFlush),
useSLM(useSLM),
guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControl),
gsba32BitRequired(gsba32BitRequired),
requiresCoherency(requiresCoherency),
lowPriority(lowPriority),
implicitFlush(implicitFlush),
outOfOrderExecutionAllowed(outOfOrderExecutionAllowed),
epilogueRequired(epilogueRequired),
usePerDssBackedBuffer(usePerDSSbackedBuffer),
useSingleSubdevice(useSingleSubdevice){};
CsrDependencies csrDependencies;
TimestampPacketContainer *barrierTimestampPacketNodes = nullptr;
PipelineSelectArgs pipelineSelectArgs;
@@ -102,6 +104,7 @@ struct DispatchFlags {
bool outOfOrderExecutionAllowed = false;
bool epilogueRequired = false;
bool usePerDssBackedBuffer = false;
bool useSingleSubdevice = false;
};
struct CsrSizeRequestFlags {

View File

@@ -102,12 +102,12 @@ NEO::BatchBuffer::BatchBuffer(GraphicsAllocation *commandBufferAllocation, size_
size_t chainedBatchBufferStartOffset, GraphicsAllocation *chainedBatchBuffer,
bool requiresCoherency, bool lowPriority,
QueueThrottle throttle, uint64_t sliceCount,
size_t usedSize, LinearStream *stream, void *endCmdPtr)
size_t usedSize, LinearStream *stream, void *endCmdPtr, bool useSingleSubdevice)
: commandBufferAllocation(commandBufferAllocation), startOffset(startOffset),
chainedBatchBufferStartOffset(chainedBatchBufferStartOffset), chainedBatchBuffer(chainedBatchBuffer),
requiresCoherency(requiresCoherency), low_priority(lowPriority),
throttle(throttle), sliceCount(sliceCount),
usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr) {}
usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr), useSingleSubdevice(useSingleSubdevice) {}
NEO::CommandBuffer::CommandBuffer(Device &device) : device(device) {
flushStamp.reset(new FlushStampTracker(false));

View File

@@ -30,7 +30,8 @@ struct BatchBuffer {
uint64_t sliceCount,
size_t usedSize,
LinearStream *stream,
void *endCmdPtr);
void *endCmdPtr,
bool useSingleSubdevice);
BatchBuffer() {}
GraphicsAllocation *commandBufferAllocation = nullptr;
size_t startOffset = 0u;
@@ -45,6 +46,8 @@ struct BatchBuffer {
//only used in drm csr in gem close worker active mode
LinearStream *stream = nullptr;
void *endCmdPtr = nullptr;
bool useSingleSubdevice = false;
};
struct CommandBuffer : public IDNode<CommandBuffer> {