Enable multi-tile task count post-sync writes

Related-To: NEO-6244

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2021-09-23 18:13:37 +00:00
committed by Compute-Runtime-Automation
parent 91e9587a07
commit 6091861f3e
13 changed files with 321 additions and 33 deletions

View File

@@ -302,7 +302,11 @@ bool AUBCommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, Resi
if (subCaptureManager->isSubCaptureMode()) {
if (!subCaptureManager->isSubCaptureEnabled()) {
if (this->standalone) {
*this->tagAddress = this->peekLatestSentTaskCount();
volatile uint32_t *pollAddress = this->tagAddress;
for (uint32_t i = 0; i < this->activePartitions; i++) {
*pollAddress = this->peekLatestSentTaskCount();
pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset);
}
}
return true;
}
@@ -339,7 +343,11 @@ bool AUBCommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, Resi
submitBatchBufferAub(batchBufferGpuAddress, pBatchBuffer, sizeBatchBuffer, this->getMemoryBank(batchBuffer.commandBufferAllocation), this->getPPGTTAdditionalBits(batchBuffer.commandBufferAllocation));
if (this->standalone) {
*this->tagAddress = this->peekLatestSentTaskCount();
volatile uint32_t *pollAddress = this->tagAddress;
for (uint32_t i = 0; i < this->activePartitions; i++) {
*pollAddress = this->peekLatestSentTaskCount();
pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset);
}
}
if (subCaptureManager->isSubCaptureMode()) {

View File

@@ -347,6 +347,7 @@ class CommandStreamReceiver {
KernelExecutionType lastKernelExecutionType = KernelExecutionType::Default;
MemoryCompressionState lastMemoryCompressionState = MemoryCompressionState::NotApplicable;
uint32_t activePartitions = 1;
uint32_t activePartitionsConfig = 1;
const uint32_t rootDeviceIndex;
const DeviceBitfield deviceBitfield;

View File

@@ -70,6 +70,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
size_t getCmdSizeForMediaSampler(bool mediaSamplerRequired) const;
size_t getCmdSizeForEngineMode(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForPerDssBackedBuffer(const HardwareInfo &hwInfo);
size_t getCmdSizeForActivePartitionConfig() const;
bool isComputeModeNeeded() const;
bool isPipelineSelectAlreadyProgrammed() const;
@@ -147,6 +148,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
void programStallingPipeControlForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags);
void programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags);
void programEngineModeEpliogue(LinearStream &csr, const DispatchFlags &dispatchFlags);
void programActivePartitionConfig();
void programEnginePrologue(LinearStream &csr);
size_t getCmdSizeForPrologue() const;

View File

@@ -230,7 +230,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
PipeControlArgs args(dispatchFlags.dcFlush);
args.notifyEnable = isUsedNotifyEnableForPostSync();
args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired;
args.workloadPartitionOffset = this->activePartitions > 1;
args.workloadPartitionOffset = this->activePartitions > 1 && this->staticWorkPartitioningEnabled;
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
commandStreamTask,
PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
@@ -280,6 +280,9 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
csrSizeRequestFlags.numGrfRequiredChanged = this->lastSentNumGrfRequired != dispatchFlags.numGrfRequired;
lastSentNumGrfRequired = dispatchFlags.numGrfRequired;
csrSizeRequestFlags.activePartitionsChanged = this->activePartitionsConfig != this->activePartitions;
this->activePartitionsConfig = this->activePartitions;
if (dispatchFlags.threadArbitrationPolicy != ThreadArbitrationPolicy::NotPresent) {
this->requiredThreadArbitrationPolicy = dispatchFlags.threadArbitrationPolicy;
}
@@ -356,6 +359,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
programVFEState(commandStreamCSR, dispatchFlags, device.getDeviceInfo().maxFrontEndThreads);
programPreemption(commandStreamCSR, dispatchFlags);
programActivePartitionConfig();
bool dshDirty = dshState.updateAndCheck(&dsh);
bool iohDirty = iohState.updateAndCheck(&ioh);
@@ -811,6 +815,7 @@ size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSize(const Dispat
size += getCmdSizeForPerDssBackedBuffer(device.getHardwareInfo());
size += getCmdSizeForEpilogue(dispatchFlags);
size += getCmdsSizeForHardwareContext();
size += getCmdSizeForActivePartitionConfig();
if (executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo()->workaroundTable.waSamplerCacheFlushBetweenRedescribedSurfaceReads) {
if (this->samplerCacheFlushRequired != SamplerCacheFlushState::samplerCacheFlushNotRequired) {

View File

@@ -118,4 +118,13 @@ size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForPerDssBackedBuffer(const
return 0;
}
// Command-stream space needed for the active-partition configuration.
// This variant always reports zero bytes; the programActivePartitionConfig()
// defined alongside it has an empty body.
template <typename GfxFamily>
size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForActivePartitionConfig() const {
    constexpr size_t noCommandsSize = 0u;
    return noCommandsSize;
}
// Active-partition configuration hook. This variant has an empty body: it
// performs no work, which matches the zero size reported by
// getCmdSizeForActivePartitionConfig() defined next to it.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::programActivePartitionConfig() {
}
} // namespace NEO

View File

@@ -5,6 +5,7 @@
*
*/
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver_hw_base.inl"
#include "shared/source/command_stream/device_command_stream.h"
#include "shared/source/command_stream/scratch_space_controller_xehp_and_later.h"
@@ -138,4 +139,27 @@ GraphicsAllocation *CommandStreamReceiverHw<GfxFamily>::getClearColorAllocation(
return nullptr;
}
// Command-stream space needed by programActivePartitionConfig().
//
// Returns EncodeSetMMIO::sizeMEM + EncodeSetMMIO::sizeIMM (one MEM and one IMM
// register write) when static work partitioning is enabled and
// csrSizeRequestFlags.activePartitionsChanged is set; otherwise returns 0.
// The condition is the same one programActivePartitionConfig() checks before
// emitting, so the two stay in step.
template <typename GfxFamily>
size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForActivePartitionConfig() const {
    const bool partitionConfigPending = this->staticWorkPartitioningEnabled && csrSizeRequestFlags.activePartitionsChanged;
    if (!partitionConfigPending) {
        return 0;
    }
    return EncodeSetMMIO<GfxFamily>::sizeMEM + EncodeSetMMIO<GfxFamily>::sizeIMM;
}
// Emits the partition register setup into `commandStream`.
//
// Does nothing unless static work partitioning is enabled and
// csrSizeRequestFlags.activePartitionsChanged is set. When both hold, two
// register writes are encoded, in this order:
//   1. a MEM write targeting PartitionRegisters::wparidCCSOffset, sourced from
//      the GPU address returned by getWorkPartitionAllocationGpuAddress();
//   2. an IMM write of CommonConstants::partitionAddressOffset to
//      PartitionRegisters::addressOffsetCCSOffset.
// The space consumed is what getCmdSizeForActivePartitionConfig() reports for
// the same condition.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::programActivePartitionConfig() {
    const bool partitionConfigPending = this->staticWorkPartitioningEnabled && csrSizeRequestFlags.activePartitionsChanged;
    if (!partitionConfigPending) {
        return;
    }

    uint64_t partitionAllocationGpuVa = getWorkPartitionAllocationGpuAddress();
    EncodeSetMMIO<GfxFamily>::encodeMEM(commandStream, PartitionRegisters<GfxFamily>::wparidCCSOffset, partitionAllocationGpuVa);

    // NOTE(review): the meaning of the trailing `true` argument is not visible
    // from this hunk - confirm against the EncodeSetMMIO::encodeIMM declaration.
    EncodeSetMMIO<GfxFamily>::encodeIMM(commandStream, PartitionRegisters<GfxFamily>::addressOffsetCCSOffset, CommonConstants::partitionAddressOffset, true);
}
} // namespace NEO

View File

@@ -123,5 +123,6 @@ struct CsrSizeRequestFlags {
bool hasSharedHandles = false;
bool numGrfRequiredChanged = false;
bool specialPipelineSelectModeChanged = false;
bool activePartitionsChanged = false;
};
} // namespace NEO

View File

@@ -468,8 +468,12 @@ template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::flushSubmissionsAndDownloadAllocations() {
this->flushBatchedSubmissions();
while (*this->getTagAddress() < this->latestFlushedTaskCount) {
downloadAllocation(*this->getTagAllocation());
volatile uint32_t *pollAddress = this->getTagAddress();
for (uint32_t i = 0; i < this->activePartitions; i++) {
while (*pollAddress < this->latestFlushedTaskCount) {
downloadAllocation(*this->getTagAllocation());
}
pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset);
}
for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) {
@@ -535,8 +539,12 @@ void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocation(GraphicsAllocatio
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocations() {
while (*this->getTagAddress() < this->latestFlushedTaskCount) {
downloadAllocation(*this->getTagAllocation());
volatile uint32_t *pollAddress = this->getTagAddress();
for (uint32_t i = 0; i < this->activePartitions; i++) {
while (*pollAddress < this->latestFlushedTaskCount) {
downloadAllocation(*this->getTagAllocation());
}
pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset);
}
for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) {
downloadAllocation(*graphicsAllocation);