Adapt command stream receiver to multiple active partitions

Related-To: NEO-6244

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2021-09-17 13:05:26 +00:00
committed by Compute-Runtime-Automation
parent 3bb2985462
commit 3b35ba504f
40 changed files with 214 additions and 208 deletions

View File

@@ -17,9 +17,6 @@ namespace ImplicitScaling {
extern bool apiSupport;
extern bool semaphoreProgrammingRequired;
extern bool crossTileAtomicSynchronization;
constexpr uint32_t partitionAddressOffsetDwords = 2u;
constexpr uint32_t partitionAddressOffset = sizeof(uint32_t) * partitionAddressOffsetDwords;
} // namespace ImplicitScaling
struct ImplicitScalingHelper {

View File

@@ -62,7 +62,7 @@ class AUBCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw<GfxFa
MOCKABLE_VIRTUAL void submitBatchBufferAub(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits);
void pollForCompletion() override;
void pollForCompletionImpl() override;
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode, uint32_t partitionCount, uint32_t offsetSize) override;
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override;
uint32_t getDumpHandle();
MOCKABLE_VIRTUAL void addContextToken(uint32_t dumpHandle);

View File

@@ -596,8 +596,8 @@ void AUBCommandStreamReceiverHw<GfxFamily>::pollForCompletionImpl() {
}
// Delegates the wait to the CommandStreamReceiverSimulatedHw implementation,
// forwarding the arguments unchanged, and afterwards polls for completion.
template <typename GfxFamily>
inline void AUBCommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode, uint32_t partitionCount, uint32_t offsetSize) {
CommandStreamReceiverSimulatedHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode, partitionCount, offsetSize);
inline void AUBCommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) {
CommandStreamReceiverSimulatedHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode);
// The base-class wait is always followed by an explicit poll on this receiver.
pollForCompletion();
}

View File

@@ -257,10 +257,6 @@ void CommandStreamReceiver::cleanupResources() {
}
bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) {
return waitForCompletionWithTimeout(getTagAddress(), enableTimeout, timeoutMicroseconds, taskCountToWait, 1u, 0u);
}
bool CommandStreamReceiver::waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait, uint32_t partitionCount, uint32_t offsetSize) {
std::chrono::high_resolution_clock::time_point time1, time2;
int64_t timeDiff = 0;
@@ -275,10 +271,9 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(volatile uint32_t *poll
}
}
volatile uint32_t *partitionAddress = pollAddress;
volatile uint32_t *partitionAddress = getTagAddress();
time1 = std::chrono::high_resolution_clock::now();
for (uint32_t i = 0; i < partitionCount; i++) {
for (uint32_t i = 0; i < activePartitions; i++) {
while (*partitionAddress < taskCountToWait && timeDiff <= timeoutMicroseconds) {
if (WaitUtils::waitFunction(partitionAddress, taskCountToWait)) {
break;
@@ -290,16 +285,16 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(volatile uint32_t *poll
}
}
partitionAddress = ptrOffset(partitionAddress, offsetSize);
partitionAddress = ptrOffset(partitionAddress, CommonConstants::partitionAddressOffset);
}
partitionAddress = pollAddress;
for (uint32_t i = 0; i < partitionCount; i++) {
partitionAddress = getTagAddress();
for (uint32_t i = 0; i < activePartitions; i++) {
if (*partitionAddress < taskCountToWait) {
return false;
}
partitionAddress = ptrOffset(partitionAddress, offsetSize);
partitionAddress = ptrOffset(partitionAddress, CommonConstants::partitionAddressOffset);
}
return true;
}

View File

@@ -124,7 +124,7 @@ class CommandStreamReceiver {
MOCKABLE_VIRTUAL volatile uint32_t *getTagAddress() const { return tagAddress; }
uint64_t getDebugPauseStateGPUAddress() const { return tagAllocation->getGpuAddress() + debugPauseStateAddressOffset; }
virtual bool waitForFlushStamp(FlushStamp &flushStampToWait, uint32_t partitionCount, uint32_t offsetSize) { return true; };
virtual bool waitForFlushStamp(FlushStamp &flushStampToWait) { return true; };
uint32_t peekTaskCount() const { return taskCount; }
@@ -156,9 +156,8 @@ class CommandStreamReceiver {
void requestStallingPipeControlOnNextFlush() { stallingPipeControlOnNextFlushRequired = true; }
bool isStallingPipeControlOnNextFlushRequired() const { return stallingPipeControlOnNextFlushRequired; }
virtual void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode, uint32_t partitionCount, uint32_t offsetSize) = 0;
virtual void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0;
virtual bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
MOCKABLE_VIRTUAL bool waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait, uint32_t partitionCount, uint32_t offsetSize);
virtual void downloadAllocations(){};
void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; }
@@ -267,6 +266,14 @@ class CommandStreamReceiver {
return this->streamProperties;
}
// Stores newPartitionCount in the activePartitions member.
inline void setActivePartitions(uint32_t newPartitionCount) {
activePartitions = newPartitionCount;
}
// Returns the current value of the activePartitions member.
inline uint32_t getActivePartitions() const {
return activePartitions;
}
protected:
void cleanupResources();
void printDeviceIndex();
@@ -291,8 +298,9 @@ class CommandStreamReceiver {
ExecutionEnvironment &executionEnvironment;
LinearStream commandStream;
StreamProperties streamProperties{};
// offset for debug state must be 8 bytes, if only 4 bytes are used tag writes overwrite it
// offset for debug state must be 64 bytes because tag writes can use multiple dwords for multiple partitions
const uint64_t debugPauseStateAddressOffset = MemoryConstants::cacheLineSize;
uint64_t totalMemoryUsed = 0u;
@@ -338,7 +346,7 @@ class CommandStreamReceiver {
uint32_t lastAdditionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;
KernelExecutionType lastKernelExecutionType = KernelExecutionType::Default;
MemoryCompressionState lastMemoryCompressionState = MemoryCompressionState::NotApplicable;
StreamProperties streamProperties{};
uint32_t activePartitions = 1;
const uint32_t rootDeviceIndex;
const DeviceBitfield deviceBitfield;

View File

@@ -75,7 +75,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
bool isPipelineSelectAlreadyProgrammed() const;
void programComputeMode(LinearStream &csr, DispatchFlags &dispatchFlags, const HardwareInfo &hwInfo);
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode, uint32_t partitionCount, uint32_t offsetSize) override;
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override;
const HardwareInfo &peekHwInfo() const;
void collectStateBaseAddresPatchInfo(

View File

@@ -230,6 +230,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
PipeControlArgs args(dispatchFlags.dcFlush);
args.notifyEnable = isUsedNotifyEnableForPostSync();
args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired;
args.workloadPartitionOffset = this->activePartitions > 1;
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
commandStreamTask,
PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
@@ -865,7 +866,7 @@ inline void CommandStreamReceiverHw<GfxFamily>::emitNoop(LinearStream &commandSt
}
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode, uint32_t partitionCount, uint32_t offsetSize) {
inline void CommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) {
updateTagFromWait();
int64_t waitTimeout = 0;
@@ -877,20 +878,11 @@ inline void CommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFal
"\nWaiting for task count %u at location %p. Current value: %u\n",
taskCountToWait, getTagAddress(), *getTagAddress());
bool status;
if (partitionCount > 1) {
status = waitForCompletionWithTimeout(getTagAddress(), enableTimeout, waitTimeout, taskCountToWait, partitionCount, offsetSize);
} else {
status = waitForCompletionWithTimeout(enableTimeout, waitTimeout, taskCountToWait);
}
bool status = waitForCompletionWithTimeout(enableTimeout, waitTimeout, taskCountToWait);
if (!status) {
waitForFlushStamp(flushStampToWait, partitionCount, offsetSize);
waitForFlushStamp(flushStampToWait);
//now call blocking wait, this is to ensure that task count is reached
if (partitionCount > 1) {
status = waitForCompletionWithTimeout(getTagAddress(), false, 0, taskCountToWait, partitionCount, offsetSize);
} else {
status = waitForCompletionWithTimeout(false, 0, taskCountToWait);
}
status = waitForCompletionWithTimeout(false, 0, taskCountToWait);
}
UNRECOVERABLE_IF(*getTagAddress() < taskCountToWait);
@@ -1125,7 +1117,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::blitBuffer(const BlitPropertiesCont
lock.unlock();
if (blocking) {
waitForTaskCountWithKmdNotifyFallback(newTaskCount, flushStampToWait, false, false, 1, 0);
waitForTaskCountWithKmdNotifyFallback(newTaskCount, flushStampToWait, false, false);
internalAllocationStorage->cleanAllocationList(newTaskCount, TEMPORARY_ALLOCATION);
}

View File

@@ -40,8 +40,7 @@ class CommandStreamReceiverWithAUBDump : public BaseCSR {
}
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait,
bool useQuickKmdSleep, bool forcePowerSavingMode,
uint32_t partitionCount, uint32_t offsetSize) override;
bool useQuickKmdSleep, bool forcePowerSavingMode) override;
size_t getPreferredTagPoolSize() const override { return 1; }

View File

@@ -70,13 +70,12 @@ void CommandStreamReceiverWithAUBDump<BaseCSR>::setupContext(OsContext &osContex
// Forwards the wait to the attached aubCSR when one is set, and then
// unconditionally to the BaseCSR implementation, using the same arguments.
template <typename BaseCSR>
void CommandStreamReceiverWithAUBDump<BaseCSR>::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait,
bool useQuickKmdSleep, bool forcePowerSavingMode,
uint32_t partitionCount, uint32_t offsetSize) {
bool useQuickKmdSleep, bool forcePowerSavingMode) {
// aubCSR may be null; the wait on it is skipped in that case.
if (aubCSR) {
aubCSR->waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode, partitionCount, offsetSize);
aubCSR->waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode);
}
BaseCSR::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode, partitionCount, offsetSize);
BaseCSR::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode);
}
template <typename BaseCSR>

View File

@@ -43,7 +43,7 @@ class TbxCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw<GfxFa
bool flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override;
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode, uint32_t partitionCount, uint32_t offsetSize) override;
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override;
bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) override;
void downloadAllocations() override;

View File

@@ -479,9 +479,9 @@ void TbxCommandStreamReceiverHw<GfxFamily>::flushSubmissionsAndDownloadAllocatio
}
// Calls flushSubmissionsAndDownloadAllocations() first, then delegates the
// wait to BaseClass with the same arguments.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode, uint32_t partitionCount, uint32_t offsetSize) {
void TbxCommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) {
flushSubmissionsAndDownloadAllocations();
BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode, partitionCount, offsetSize);
BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode);
}
template <typename GfxFamily>

View File

@@ -89,4 +89,6 @@ constexpr uint32_t unspecifiedDeviceIndex = std::numeric_limits<uint32_t>::max()
constexpr uint32_t invalidStepping = std::numeric_limits<uint32_t>::max();
constexpr uint32_t maximalSimdSize = 32;
constexpr uint32_t engineGroupCount = static_cast<uint32_t>(NEO::EngineGroupType::MaxEngineGroups);
// Distance between consecutive per-partition addresses, expressed in dwords.
constexpr uint32_t partitionAddressOffsetDwords = 2u;
// The same distance in bytes; the wait loops in this change advance the tag address by this amount per partition.
constexpr uint32_t partitionAddressOffset = sizeof(uint32_t) * partitionAddressOffsetDwords;
} // namespace CommonConstants

View File

@@ -47,7 +47,7 @@ class DrmCommandStreamReceiver : public DeviceCommandStreamReceiver<GfxFamily> {
bool flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override;
MOCKABLE_VIRTUAL void processResidency(const ResidencyContainer &allocationsForResidency, uint32_t handleId) override;
void makeNonResident(GraphicsAllocation &gfxAllocation) override;
bool waitForFlushStamp(FlushStamp &flushStampToWait, uint32_t partitionCount, uint32_t offsetSize) override;
bool waitForFlushStamp(FlushStamp &flushStampToWait) override;
bool isKmdWaitModeActive() override;
DrmMemoryManager *getMemoryManager() const;
@@ -66,7 +66,7 @@ class DrmCommandStreamReceiver : public DeviceCommandStreamReceiver<GfxFamily> {
protected:
MOCKABLE_VIRTUAL void flushInternal(const BatchBuffer &batchBuffer, const ResidencyContainer &allocationsForResidency);
MOCKABLE_VIRTUAL void exec(const BatchBuffer &batchBuffer, uint32_t vmHandleId, uint32_t drmContextId);
MOCKABLE_VIRTUAL int waitUserFence(uint32_t waitValue, uint32_t partitionCount, uint32_t offsetSize);
MOCKABLE_VIRTUAL int waitUserFence(uint32_t waitValue);
bool isUserFenceWaitActive();
std::vector<BufferObject *> residency;

View File

@@ -228,10 +228,10 @@ GmmPageTableMngr *DrmCommandStreamReceiver<GfxFamily>::createPageTableManager()
}
template <typename GfxFamily>
bool DrmCommandStreamReceiver<GfxFamily>::waitForFlushStamp(FlushStamp &flushStamp, uint32_t partitionCount, uint32_t offsetSize) {
bool DrmCommandStreamReceiver<GfxFamily>::waitForFlushStamp(FlushStamp &flushStamp) {
auto waitValue = static_cast<uint32_t>(flushStamp);
if (isUserFenceWaitActive()) {
waitUserFence(waitValue, partitionCount, offsetSize);
waitUserFence(waitValue);
} else {
this->drm->waitHandle(waitValue, kmdWaitTimeout);
}

View File

@@ -17,7 +17,7 @@ void DrmCommandStreamReceiver<GfxFamily>::flushInternal(const BatchBuffer &batch
}
template <typename GfxFamily>
int DrmCommandStreamReceiver<GfxFamily>::waitUserFence(uint32_t waitValue, uint32_t partitionCount, uint32_t offsetSize) {
int DrmCommandStreamReceiver<GfxFamily>::waitUserFence(uint32_t waitValue) {
uint32_t ctxId = 0u;
uint64_t tagAddress = castToUint64(const_cast<uint32_t *>(getTagAddress()));
if (useContextForUserFenceWait) {

View File

@@ -50,7 +50,7 @@ void DrmCommandStreamReceiver<GfxFamily>::flushInternal(const BatchBuffer &batch
}
template <typename GfxFamily>
int DrmCommandStreamReceiver<GfxFamily>::waitUserFence(uint32_t waitValue, uint32_t partitionCount, uint32_t offsetSize) {
int DrmCommandStreamReceiver<GfxFamily>::waitUserFence(uint32_t waitValue) {
int ret = 0;
StackVec<uint32_t, 32> ctxIds;
uint64_t tagAddress = castToUint64(const_cast<uint32_t *>(getTagAddress()));
@@ -62,15 +62,15 @@ int DrmCommandStreamReceiver<GfxFamily>::waitUserFence(uint32_t waitValue, uint3
ctxIds.push_back(ctxId);
}
}
UNRECOVERABLE_IF(ctxIds.size() != partitionCount);
for (uint32_t i = 0; i < partitionCount; i++) {
UNRECOVERABLE_IF(ctxIds.size() != this->activePartitions);
for (uint32_t i = 0; i < this->activePartitions; i++) {
ret |= this->drm->waitUserFence(ctxIds[i], tagAddress, waitValue, Drm::ValueWidth::U32, kmdWaitTimeout, 0u);
tagAddress += offsetSize;
tagAddress += CommonConstants::partitionAddressOffset;
}
} else {
for (uint32_t i = 0; i < partitionCount; i++) {
for (uint32_t i = 0; i < this->activePartitions; i++) {
ret |= this->drm->waitUserFence(0u, tagAddress, waitValue, Drm::ValueWidth::U32, kmdWaitTimeout, 0u);
tagAddress += offsetSize;
tagAddress += CommonConstants::partitionAddressOffset;
}
}

View File

@@ -27,7 +27,7 @@ class WddmCommandStreamReceiver : public DeviceCommandStreamReceiver<GfxFamily>
bool flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override;
void processResidency(const ResidencyContainer &allocationsForResidency, uint32_t handleId) override;
void processEviction() override;
bool waitForFlushStamp(FlushStamp &flushStampToWait, uint32_t partitionCount, uint32_t offsetSize) override;
bool waitForFlushStamp(FlushStamp &flushStampToWait) override;
WddmMemoryManager *getMemoryManager() const;
Wddm *peekWddm() const {

View File

@@ -131,7 +131,7 @@ WddmMemoryManager *WddmCommandStreamReceiver<GfxFamily>::getMemoryManager() cons
}
// Waits from the CPU for flushStampToWait on the monitored fence obtained from
// the OS context's residency controller, and returns the result of
// wddm->waitFromCpu().
template <typename GfxFamily>
bool WddmCommandStreamReceiver<GfxFamily>::waitForFlushStamp(FlushStamp &flushStampToWait, uint32_t partitionCount, uint32_t offsetSize) {
bool WddmCommandStreamReceiver<GfxFamily>::waitForFlushStamp(FlushStamp &flushStampToWait) {
// osContext is downcast to OsContextWin to reach the residency controller.
return wddm->waitFromCpu(flushStampToWait, static_cast<OsContextWin *>(this->osContext)->getResidencyController().getMonitoredFence());
}