performance: add GPU flushing mechanism to immediate flush

Related-To: NEO-7808

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2023-06-27 19:54:20 +00:00 committed by Compute-Runtime-Automation
parent df62888efc
commit eb4e7fb2a6
4 changed files with 184 additions and 12 deletions

View File

@ -27,6 +27,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
PipelineSelectArgs pipelineSelectArgs{};
size_t estimatedSize = 0;
void *endPtr = nullptr;
size_t csrStartOffset = 0;
bool pipelineSelectFullConfigurationNeeded = false;
bool pipelineSelectDirty = false;
@ -262,7 +263,16 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
LinearStream &immediateCommandStream,
ImmediateFlushData &flushData);
inline void handleImmediateFlushAllocationsResidency(Device &device);
inline void handleImmediateFlushAllocationsResidency(Device &device,
LinearStream &immediateCommandStream,
ImmediateFlushData &flushData,
LinearStream &csrStream);
inline CompletionStamp handleImmediateFlushSendBatchBuffer(LinearStream &immediateCommandStream,
size_t immediateCommandStreamStart,
ImmediateDispatchFlags &dispatchFlags,
ImmediateFlushData &flushData,
LinearStream &csrStream);
HeapDirtyState dshState;
HeapDirtyState iohState;

View File

@ -308,6 +308,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushImmediateTask(
handleImmediateFlushJumpToImmediate(flushData);
auto &csrCommandStream = getCS(flushData.estimatedSize);
flushData.csrStartOffset = csrCommandStream.getUsed();
dispatchImmediateFlushPipelineSelectCommand(flushData, csrCommandStream);
dispatchImmediateFlushFrontEndCommand(scratchAddress, flushData, device, csrCommandStream);
@ -318,16 +319,17 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushImmediateTask(
dispatchImmediateFlushJumpToImmediateCommand(immediateCommandStream, immediateCommandStreamStart, flushData, csrCommandStream);
dispatchImmediateFlushClientBufferCommands(dispatchFlags, immediateCommandStream, flushData);
this->latestSentTaskCount = taskCount + 1;
handleImmediateFlushAllocationsResidency(device);
handleImmediateFlushAllocationsResidency(device,
immediateCommandStream,
flushData,
csrCommandStream);
++taskCount;
CompletionStamp completionStamp = {
this->taskCount,
this->taskLevel,
flushStamp->peekStamp()};
return completionStamp;
return handleImmediateFlushSendBatchBuffer(immediateCommandStream,
immediateCommandStreamStart,
dispatchFlags,
flushData,
csrCommandStream);
}
template <typename GfxFamily>
@ -2044,7 +2046,10 @@ void CommandStreamReceiverHw<GfxFamily>::dispatchImmediateFlushOneTimeContextIni
}
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushAllocationsResidency(Device &device) {
void CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushAllocationsResidency(Device &device,
LinearStream &immediateCommandStream,
ImmediateFlushData &flushData,
LinearStream &csrStream) {
this->makeResident(*tagAllocation);
if (globalFenceAllocation) {
@ -2058,6 +2063,10 @@ void CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushAllocationsResidenc
if (device.getRTMemoryBackedBuffer()) {
makeResident(*device.getRTMemoryBackedBuffer());
}
if (flushData.estimatedSize > 0) {
makeResident(*csrStream.getGraphicsAllocation());
}
}
template <typename GfxFamily>
@ -2099,8 +2108,6 @@ void CommandStreamReceiverHw<GfxFamily>::dispatchImmediateFlushClientBufferComma
this->taskCount + 1,
peekRootDeviceEnvironment(),
args);
this->latestFlushedTaskCount = this->taskCount + 1;
}
makeResident(*immediateCommandStream.getGraphicsAllocation());
@ -2109,4 +2116,53 @@ void CommandStreamReceiverHw<GfxFamily>::dispatchImmediateFlushClientBufferComma
EncodeNoop<GfxFamily>::alignToCacheLine(immediateCommandStream);
}
// Builds the batch buffer for an immediate flush, hands it to flushHandler and
// returns the resulting completion stamp.
//
// When flushData.estimatedSize is non-zero the submission starts in the CSR
// stream at flushData.csrStartOffset and the immediate stream allocation is
// passed along as the chained batch buffer. Otherwise the immediate stream is
// submitted directly, starting at immediateCommandStreamStart.
//
// On a non-SUCCESS submission status the sent task count and the submitted
// stream's task count are rolled back and the returned stamp carries the task
// count derived from the error status. On success taskCount is incremented and
// returned together with taskLevel and the current flush stamp.
template <typename GfxFamily>
CompletionStamp CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushSendBatchBuffer(LinearStream &immediateCommandStream,
                                                                                        size_t immediateCommandStreamStart,
                                                                                        ImmediateDispatchFlags &dispatchFlags,
                                                                                        ImmediateFlushData &flushData,
                                                                                        LinearStream &csrStream) {
    this->latestSentTaskCount = taskCount + 1;

    const bool submitThroughCsr = flushData.estimatedSize > 0;

    // Defaults describe a direct submission of the immediate stream.
    LinearStream *submittedStream = &immediateCommandStream;
    size_t submissionStartOffset = immediateCommandStreamStart;
    GraphicsAllocation *chainedAllocation = nullptr;
    size_t chainedStartOffset = 0;
    if (submitThroughCsr) {
        submittedStream = &csrStream;
        submissionStartOffset = flushData.csrStartOffset;
        chainedAllocation = immediateCommandStream.getGraphicsAllocation();
        chainedStartOffset = csrStream.getUsed();
    }

    // The task start address always points into the immediate stream, regardless of which stream is submitted.
    const uint64_t taskGpuStartAddress = immediateCommandStream.getGpuBase() + immediateCommandStreamStart;

    // A submission that goes through the CSR stream is always reported as stalling.
    const bool stallingSubmission = submitThroughCsr || dispatchFlags.blockingAppend || dispatchFlags.hasStallingCmds;

    constexpr bool immediateRequiresCoherency = false;
    constexpr bool immediateLowPriority = false;
    constexpr QueueThrottle immediateThrottle = QueueThrottle::MEDIUM;
    constexpr uint64_t immediateSliceCount = QueueSliceCount::defaultSliceCount;

    BatchBuffer batchBuffer{submittedStream->getGraphicsAllocation(), submissionStartOffset, chainedStartOffset, taskGpuStartAddress, chainedAllocation,
                            immediateRequiresCoherency, immediateLowPriority, immediateThrottle, immediateSliceCount,
                            submittedStream->getUsed(), submittedStream, flushData.endPtr, this->getNumClients(), stallingSubmission,
                            dispatchFlags.hasRelaxedOrderingDependencies};

    updateStreamTaskCount(*submittedStream, taskCount + 1);

    const auto flushStatus = flushHandler(batchBuffer, this->getResidencyAllocations());
    if (flushStatus != SubmissionStatus::SUCCESS) {
        // Undo the optimistic bookkeeping done before the submission attempt.
        --this->latestSentTaskCount;
        updateStreamTaskCount(*submittedStream, taskCount);

        CompletionStamp failureStamp = {CompletionStamp::getTaskCountFromSubmissionStatusError(flushStatus)};
        return failureStamp;
    }

    // Only blocking appends mark the new task count as flushed.
    if (dispatchFlags.blockingAppend) {
        this->latestFlushedTaskCount = this->taskCount + 1;
    }

    ++taskCount;
    CompletionStamp successStamp = {
        this->taskCount,
        this->taskLevel,
        flushStamp->peekStamp()};
    return successStamp;
}
} // namespace NEO

View File

@ -141,6 +141,7 @@ struct ImmediateDispatchFlags {
void *sshCpuBase = nullptr;
bool blockingAppend = false;
bool hasRelaxedOrderingDependencies = false;
bool hasStallingCmds = false;
};
} // namespace NEO

View File

@ -3894,3 +3894,108 @@ HWTEST2_F(CommandStreamReceiverHwTest,
startOffset = commandStream.getUsed();
EXPECT_EQ(0u, (startOffset % MemoryConstants::cacheLineSize));
}
// Checks which command buffer flushImmediateTask submits on two consecutive
// flushes: the first flush is expected to be submitted from the CSR command
// buffer, the second one directly from the immediate command buffer at the
// offset where the second task starts.
HWTEST2_F(CommandStreamReceiverHwTest,
          givenImmediateFlushTaskWhenPreambleIsUsedOrNotThenCsrBufferIsUsedOrImmediateBufferIsUsed,
          IsAtLeastXeHpCore) {
    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;

    auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
    commandStreamReceiver.storeMakeResidentAllocations = true;
    commandStreamReceiver.recordFlusheBatchBuffer = true;

    // First flush: stalling commands requested by the caller.
    auto startOffset = commandStream.getUsed();
    auto immediateListCmdBufferAllocation = commandStream.getGraphicsAllocation();

    *commandStream.getSpaceForCmd<COMPUTE_WALKER>() = FamilyType::cmdInitGpgpuWalker;
    immediateFlushTaskFlags.hasStallingCmds = true;
    auto completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice);
    // Queried after the flush, so it refers to the allocation the CSR stream holds once the flush is done.
    auto csrCmdBufferAllocation = commandStreamReceiver.commandStream.getGraphicsAllocation();

    TaskCountType currentTaskCountType = 1u;
    EXPECT_EQ(currentTaskCountType, completionStamp.taskCount);
    EXPECT_EQ(currentTaskCountType, commandStreamReceiver.taskCount);
    EXPECT_EQ(currentTaskCountType, commandStreamReceiver.latestSentTaskCount);
    // No blocking append was requested, so the flushed task count must not move.
    EXPECT_EQ(0u, commandStreamReceiver.latestFlushedTaskCount);

    EXPECT_TRUE(commandStreamReceiver.isMadeResident(csrCmdBufferAllocation, currentTaskCountType));
    EXPECT_TRUE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation, currentTaskCountType));

    // Reference to the recorded batch buffer inside the receiver: it always
    // shows the most recently recorded submission, so it needs no re-reading
    // after the second flush below.
    BatchBuffer &recordedBatchBuffer = commandStreamReceiver.latestFlushedBatchBuffer;
    EXPECT_EQ(csrCmdBufferAllocation, recordedBatchBuffer.commandBufferAllocation);
    EXPECT_EQ(0u, recordedBatchBuffer.startOffset);
    EXPECT_EQ(true, recordedBatchBuffer.hasStallingCmds);
    EXPECT_EQ(false, recordedBatchBuffer.hasRelaxedOrderingDependencies);

    // Second flush: additionally request relaxed ordering dependencies.
    startOffset = commandStream.getUsed();
    *commandStream.getSpaceForCmd<COMPUTE_WALKER>() = FamilyType::cmdInitGpgpuWalker;
    immediateFlushTaskFlags.hasRelaxedOrderingDependencies = true;
    completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice);

    currentTaskCountType = 2u;
    EXPECT_EQ(currentTaskCountType, completionStamp.taskCount);
    EXPECT_EQ(currentTaskCountType, commandStreamReceiver.taskCount);
    EXPECT_EQ(currentTaskCountType, commandStreamReceiver.latestSentTaskCount);
    EXPECT_EQ(0u, commandStreamReceiver.latestFlushedTaskCount);

    // The CSR buffer is not expected to carry the second task count; only the immediate buffer is.
    EXPECT_FALSE(commandStreamReceiver.isMadeResident(csrCmdBufferAllocation, currentTaskCountType));
    EXPECT_TRUE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation, currentTaskCountType));

    EXPECT_EQ(immediateListCmdBufferAllocation, recordedBatchBuffer.commandBufferAllocation);
    EXPECT_EQ(startOffset, recordedBatchBuffer.startOffset);
    EXPECT_EQ(true, recordedBatchBuffer.hasStallingCmds);
    EXPECT_EQ(true, recordedBatchBuffer.hasRelaxedOrderingDependencies);
}
// Checks the failure path of flushImmediateTask: with the receiver's
// flushReturnValue set to FAILED, two consecutive flushes must both return the
// failed completion stamp and leave all task counters at zero.
HWTEST2_F(CommandStreamReceiverHwTest,
          givenImmediateFlushTaskWhenFlushOperationFailsThenExpectNoBatchBufferSentAndCorrectFailCompletionReturned,
          IsAtLeastXeHpCore) {
    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;

    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    csr.storeMakeResidentAllocations = true;
    csr.recordFlusheBatchBuffer = true;

    auto immediateStartOffset = commandStream.getUsed();
    auto immediateAllocation = commandStream.getGraphicsAllocation();

    *commandStream.getSpaceForCmd<COMPUTE_WALKER>() = FamilyType::cmdInitGpgpuWalker;
    immediateFlushTaskFlags.blockingAppend = true;
    // NOTE(review): presumably this is the status the test receiver reports from its flush - confirm against the mock.
    csr.flushReturnValue = NEO::SubmissionStatus::FAILED;

    // First failing flush.
    auto stamp = csr.flushImmediateTask(commandStream, immediateStartOffset, immediateFlushTaskFlags, *pDevice);
    auto csrAllocation = csr.commandStream.getGraphicsAllocation();

    // Task count the failed submission would have received had it succeeded.
    TaskCountType attemptedTaskCount = 1u;
    EXPECT_EQ(NEO::CompletionStamp::failed, stamp.taskCount);
    EXPECT_EQ(0u, csr.taskCount);
    EXPECT_EQ(0u, csr.latestSentTaskCount);
    EXPECT_EQ(0u, csr.latestFlushedTaskCount);

    EXPECT_FALSE(csr.isMadeResident(csrAllocation, attemptedTaskCount));
    EXPECT_TRUE(csr.isMadeResident(immediateAllocation, attemptedTaskCount));

    // Nothing is expected to have been recorded as flushed.
    BatchBuffer &recorded = csr.latestFlushedBatchBuffer;
    EXPECT_EQ(nullptr, recorded.commandBufferAllocation);
    EXPECT_EQ(0u, recorded.startOffset);
    EXPECT_FALSE(recorded.hasStallingCmds);
    EXPECT_FALSE(recorded.hasRelaxedOrderingDependencies);

    // Second failing flush, reusing the same start offset.
    stamp = csr.flushImmediateTask(commandStream, immediateStartOffset, immediateFlushTaskFlags, *pDevice);
    EXPECT_EQ(NEO::CompletionStamp::failed, stamp.taskCount);
    EXPECT_EQ(0u, csr.taskCount);
    EXPECT_EQ(0u, csr.latestSentTaskCount);
    EXPECT_EQ(0u, csr.latestFlushedTaskCount);

    EXPECT_FALSE(csr.isMadeResident(immediateAllocation, attemptedTaskCount));
}