performance: immediate flush - add flushing mechanism to GPU
Related-To: NEO-7808 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
parent
df62888efc
commit
eb4e7fb2a6
|
@ -27,6 +27,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
|
|||
PipelineSelectArgs pipelineSelectArgs{};
|
||||
size_t estimatedSize = 0;
|
||||
void *endPtr = nullptr;
|
||||
size_t csrStartOffset = 0;
|
||||
|
||||
bool pipelineSelectFullConfigurationNeeded = false;
|
||||
bool pipelineSelectDirty = false;
|
||||
|
@ -262,7 +263,16 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
|
|||
LinearStream &immediateCommandStream,
|
||||
ImmediateFlushData &flushData);
|
||||
|
||||
inline void handleImmediateFlushAllocationsResidency(Device &device);
|
||||
// Makes the allocations needed by an immediate flush resident.
// In the part of the definition visible in this change, the tag allocation and
// (when present) the device RT memory backed buffer are made resident, and the
// allocation behind csrStream is made resident only when
// flushData.estimatedSize > 0.
// NOTE(review): immediateCommandStream is not used in the visible part of the
// definition - confirm against the full body.
inline void handleImmediateFlushAllocationsResidency(Device &device,
|
||||
LinearStream &immediateCommandStream,
|
||||
ImmediateFlushData &flushData,
|
||||
LinearStream &csrStream);
|
||||
|
||||
// Builds a BatchBuffer for the immediate flush and hands it to flushHandler.
// When flushData.estimatedSize > 0 the submission starts in csrStream at
// flushData.csrStartOffset and the immediate stream allocation is passed as the
// chained batch buffer; otherwise the immediate stream is submitted directly
// from immediateCommandStreamStart.
// Returns the completion stamp of the new task on success, or a stamp whose
// task count is derived from the failing SubmissionStatus.
inline CompletionStamp handleImmediateFlushSendBatchBuffer(LinearStream &immediateCommandStream,
|
||||
size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags,
|
||||
ImmediateFlushData &flushData,
|
||||
LinearStream &csrStream);
|
||||
|
||||
HeapDirtyState dshState;
|
||||
HeapDirtyState iohState;
|
||||
|
|
|
@ -308,6 +308,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushImmediateTask(
|
|||
handleImmediateFlushJumpToImmediate(flushData);
|
||||
|
||||
auto &csrCommandStream = getCS(flushData.estimatedSize);
|
||||
flushData.csrStartOffset = csrCommandStream.getUsed();
|
||||
|
||||
dispatchImmediateFlushPipelineSelectCommand(flushData, csrCommandStream);
|
||||
dispatchImmediateFlushFrontEndCommand(scratchAddress, flushData, device, csrCommandStream);
|
||||
|
@ -318,16 +319,17 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushImmediateTask(
|
|||
dispatchImmediateFlushJumpToImmediateCommand(immediateCommandStream, immediateCommandStreamStart, flushData, csrCommandStream);
|
||||
|
||||
dispatchImmediateFlushClientBufferCommands(dispatchFlags, immediateCommandStream, flushData);
|
||||
this->latestSentTaskCount = taskCount + 1;
|
||||
|
||||
handleImmediateFlushAllocationsResidency(device);
|
||||
handleImmediateFlushAllocationsResidency(device,
|
||||
immediateCommandStream,
|
||||
flushData,
|
||||
csrCommandStream);
|
||||
|
||||
++taskCount;
|
||||
CompletionStamp completionStamp = {
|
||||
this->taskCount,
|
||||
this->taskLevel,
|
||||
flushStamp->peekStamp()};
|
||||
return completionStamp;
|
||||
return handleImmediateFlushSendBatchBuffer(immediateCommandStream,
|
||||
immediateCommandStreamStart,
|
||||
dispatchFlags,
|
||||
flushData,
|
||||
csrCommandStream);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
|
@ -2044,7 +2046,10 @@ void CommandStreamReceiverHw<GfxFamily>::dispatchImmediateFlushOneTimeContextIni
|
|||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushAllocationsResidency(Device &device) {
|
||||
void CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushAllocationsResidency(Device &device,
|
||||
LinearStream &immediateCommandStream,
|
||||
ImmediateFlushData &flushData,
|
||||
LinearStream &csrStream) {
|
||||
this->makeResident(*tagAllocation);
|
||||
|
||||
if (globalFenceAllocation) {
|
||||
|
@ -2058,6 +2063,10 @@ void CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushAllocationsResidenc
|
|||
if (device.getRTMemoryBackedBuffer()) {
|
||||
makeResident(*device.getRTMemoryBackedBuffer());
|
||||
}
|
||||
|
||||
if (flushData.estimatedSize > 0) {
|
||||
makeResident(*csrStream.getGraphicsAllocation());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
|
@ -2099,8 +2108,6 @@ void CommandStreamReceiverHw<GfxFamily>::dispatchImmediateFlushClientBufferComma
|
|||
this->taskCount + 1,
|
||||
peekRootDeviceEnvironment(),
|
||||
args);
|
||||
|
||||
this->latestFlushedTaskCount = this->taskCount + 1;
|
||||
}
|
||||
|
||||
makeResident(*immediateCommandStream.getGraphicsAllocation());
|
||||
|
@ -2109,4 +2116,53 @@ void CommandStreamReceiverHw<GfxFamily>::dispatchImmediateFlushClientBufferComma
|
|||
EncodeNoop<GfxFamily>::alignToCacheLine(immediateCommandStream);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
CompletionStamp CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushSendBatchBuffer(LinearStream &immediateCommandStream,
|
||||
size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags,
|
||||
ImmediateFlushData &flushData,
|
||||
LinearStream &csrStream) {
|
||||
this->latestSentTaskCount = taskCount + 1;
|
||||
|
||||
bool startFromCsr = flushData.estimatedSize > 0;
|
||||
size_t startOffset = startFromCsr ? flushData.csrStartOffset : immediateCommandStreamStart;
|
||||
auto &streamToSubmit = startFromCsr ? csrStream : immediateCommandStream;
|
||||
GraphicsAllocation *chainedBatchBuffer = startFromCsr ? immediateCommandStream.getGraphicsAllocation() : nullptr;
|
||||
size_t chainedBatchBufferStartOffset = startFromCsr ? csrStream.getUsed() : 0;
|
||||
uint64_t taskStartAddress = immediateCommandStream.getGpuBase() + immediateCommandStreamStart;
|
||||
bool hasStallingCmds = (startFromCsr || dispatchFlags.blockingAppend || dispatchFlags.hasStallingCmds);
|
||||
|
||||
constexpr bool immediateRequiresCoherency = false;
|
||||
constexpr bool immediateLowPriority = false;
|
||||
constexpr QueueThrottle immediateThrottle = QueueThrottle::MEDIUM;
|
||||
constexpr uint64_t immediateSliceCount = QueueSliceCount::defaultSliceCount;
|
||||
|
||||
BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, taskStartAddress, chainedBatchBuffer,
|
||||
immediateRequiresCoherency, immediateLowPriority, immediateThrottle, immediateSliceCount,
|
||||
streamToSubmit.getUsed(), &streamToSubmit, flushData.endPtr, this->getNumClients(), hasStallingCmds,
|
||||
dispatchFlags.hasRelaxedOrderingDependencies};
|
||||
updateStreamTaskCount(streamToSubmit, taskCount + 1);
|
||||
|
||||
auto submissionStatus = flushHandler(batchBuffer, this->getResidencyAllocations());
|
||||
if (submissionStatus != SubmissionStatus::SUCCESS) {
|
||||
--this->latestSentTaskCount;
|
||||
updateStreamTaskCount(streamToSubmit, taskCount);
|
||||
|
||||
CompletionStamp completionStamp = {CompletionStamp::getTaskCountFromSubmissionStatusError(submissionStatus)};
|
||||
return completionStamp;
|
||||
} else {
|
||||
if (dispatchFlags.blockingAppend) {
|
||||
this->latestFlushedTaskCount = this->taskCount + 1;
|
||||
}
|
||||
|
||||
++taskCount;
|
||||
CompletionStamp completionStamp = {
|
||||
this->taskCount,
|
||||
this->taskLevel,
|
||||
flushStamp->peekStamp()};
|
||||
|
||||
return completionStamp;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
|
|
@ -141,6 +141,7 @@ struct ImmediateDispatchFlags {
|
|||
void *sshCpuBase = nullptr;
|
||||
bool blockingAppend = false;
|
||||
bool hasRelaxedOrderingDependencies = false;
|
||||
bool hasStallingCmds = false;
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
|
|
@ -3894,3 +3894,108 @@ HWTEST2_F(CommandStreamReceiverHwTest,
|
|||
startOffset = commandStream.getUsed();
|
||||
EXPECT_EQ(0u, (startOffset % MemoryConstants::cacheLineSize));
|
||||
}
|
||||
|
||||
// Checks which command buffer flushImmediateTask submits on two consecutive flushes.
//
// First flush: the CSR command buffer is expected to be the submitted buffer
// (start offset 0) and both the CSR and the immediate buffers are resident.
// Second flush: the immediate command buffer is expected to be submitted
// directly from its own start offset and the CSR buffer is not made resident
// for that task count.
// NOTE(review): per the test name the difference is whether a preamble is
// programmed into the CSR stream - confirm against flushImmediateTask.
HWTEST2_F(CommandStreamReceiverHwTest,
          givenImmediateFlushTaskWhenPreambleIsUsedOrNotThenCsrBufferIsUsedOrImmediateBufferIsUsed,
          IsAtLeastXeHpCore) {
    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;

    auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
    commandStreamReceiver.storeMakeResidentAllocations = true;
    commandStreamReceiver.recordFlusheBatchBuffer = true;

    auto startOffset = commandStream.getUsed();
    auto immediateListCmdBufferAllocation = commandStream.getGraphicsAllocation();

    *commandStream.getSpaceForCmd<COMPUTE_WALKER>() = FamilyType::cmdInitGpgpuWalker;

    immediateFlushTaskFlags.hasStallingCmds = true;
    auto completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice);

    auto csrCmdBufferAllocation = commandStreamReceiver.commandStream.getGraphicsAllocation();

    TaskCountType currentTaskCountType = 1u;

    EXPECT_EQ(currentTaskCountType, completionStamp.taskCount);
    EXPECT_EQ(currentTaskCountType, commandStreamReceiver.taskCount);
    EXPECT_EQ(currentTaskCountType, commandStreamReceiver.latestSentTaskCount);
    EXPECT_EQ(0u, commandStreamReceiver.latestFlushedTaskCount);

    EXPECT_TRUE(commandStreamReceiver.isMadeResident(csrCmdBufferAllocation, currentTaskCountType));
    EXPECT_TRUE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation, currentTaskCountType));

    // Bound by reference: every later read observes whatever the receiver
    // recorded for its most recent flush, so no re-assignment is needed after
    // the second flush below.
    BatchBuffer &recordedBatchBuffer = commandStreamReceiver.latestFlushedBatchBuffer;
    EXPECT_EQ(csrCmdBufferAllocation, recordedBatchBuffer.commandBufferAllocation);
    EXPECT_EQ(0u, recordedBatchBuffer.startOffset);
    EXPECT_EQ(true, recordedBatchBuffer.hasStallingCmds);
    EXPECT_EQ(false, recordedBatchBuffer.hasRelaxedOrderingDependencies);

    startOffset = commandStream.getUsed();

    *commandStream.getSpaceForCmd<COMPUTE_WALKER>() = FamilyType::cmdInitGpgpuWalker;

    // hasStallingCmds stays set from the first flush; only the relaxed ordering
    // flag is added for the second one.
    immediateFlushTaskFlags.hasRelaxedOrderingDependencies = true;
    completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice);

    currentTaskCountType = 2u;

    EXPECT_EQ(currentTaskCountType, completionStamp.taskCount);
    EXPECT_EQ(currentTaskCountType, commandStreamReceiver.taskCount);
    EXPECT_EQ(currentTaskCountType, commandStreamReceiver.latestSentTaskCount);
    EXPECT_EQ(0u, commandStreamReceiver.latestFlushedTaskCount);

    EXPECT_FALSE(commandStreamReceiver.isMadeResident(csrCmdBufferAllocation, currentTaskCountType));
    EXPECT_TRUE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation, currentTaskCountType));

    EXPECT_EQ(immediateListCmdBufferAllocation, recordedBatchBuffer.commandBufferAllocation);
    EXPECT_EQ(startOffset, recordedBatchBuffer.startOffset);
    EXPECT_EQ(true, recordedBatchBuffer.hasStallingCmds);
    EXPECT_EQ(true, recordedBatchBuffer.hasRelaxedOrderingDependencies);
}
|
||||
|
||||
// Forces the flush to report SubmissionStatus::FAILED and checks that
// flushImmediateTask returns CompletionStamp::failed, leaves all task counters
// at zero and records no batch buffer - on the first attempt and on a retry.
HWTEST2_F(CommandStreamReceiverHwTest,
          givenImmediateFlushTaskWhenFlushOperationFailsThenExpectNoBatchBufferSentAndCorrectFailCompletionReturned,
          IsAtLeastXeHpCore) {
    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;

    auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    ultCsr.storeMakeResidentAllocations = true;
    ultCsr.recordFlusheBatchBuffer = true;

    const auto immediateStart = commandStream.getUsed();
    const auto immediateAllocation = commandStream.getGraphicsAllocation();

    *commandStream.getSpaceForCmd<COMPUTE_WALKER>() = FamilyType::cmdInitGpgpuWalker;

    // Blocking append would normally bump latestFlushedTaskCount; the forced
    // failure must prevent that.
    immediateFlushTaskFlags.blockingAppend = true;
    ultCsr.flushReturnValue = NEO::SubmissionStatus::FAILED;
    auto failedStamp = ultCsr.flushImmediateTask(commandStream, immediateStart, immediateFlushTaskFlags, *pDevice);

    const auto csrAllocation = ultCsr.commandStream.getGraphicsAllocation();

    // Task count the allocations would carry had the first flush been accepted.
    const TaskCountType attemptedTaskCount = 1u;

    EXPECT_EQ(NEO::CompletionStamp::failed, failedStamp.taskCount);
    EXPECT_EQ(0u, ultCsr.taskCount);
    EXPECT_EQ(0u, ultCsr.latestSentTaskCount);
    EXPECT_EQ(0u, ultCsr.latestFlushedTaskCount);

    EXPECT_FALSE(ultCsr.isMadeResident(csrAllocation, attemptedTaskCount));
    EXPECT_TRUE(ultCsr.isMadeResident(immediateAllocation, attemptedTaskCount));

    // Nothing may have been recorded as flushed.
    const BatchBuffer &recorded = ultCsr.latestFlushedBatchBuffer;
    EXPECT_EQ(nullptr, recorded.commandBufferAllocation);
    EXPECT_EQ(0u, recorded.startOffset);
    EXPECT_FALSE(recorded.hasStallingCmds);
    EXPECT_FALSE(recorded.hasRelaxedOrderingDependencies);

    // Retry with the same inputs; the failure is still in effect.
    failedStamp = ultCsr.flushImmediateTask(commandStream, immediateStart, immediateFlushTaskFlags, *pDevice);

    EXPECT_EQ(NEO::CompletionStamp::failed, failedStamp.taskCount);
    EXPECT_EQ(0u, ultCsr.taskCount);
    EXPECT_EQ(0u, ultCsr.latestSentTaskCount);
    EXPECT_EQ(0u, ultCsr.latestFlushedTaskCount);

    EXPECT_FALSE(ultCsr.isMadeResident(immediateAllocation, attemptedTaskCount));
}
|
||||
|
|
Loading…
Reference in New Issue