Relaxed ordering scheduler section for DirectSubmission

Related-To: NEO-7458

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2022-11-18 09:56:59 +00:00
committed by Compute-Runtime-Automation
parent 20cbdf8683
commit e6c1658bae
6 changed files with 931 additions and 125 deletions

View File

@@ -110,8 +110,10 @@ class DirectSubmissionHw {
void cpuCachelineFlush(void *ptr, size_t size);
void dispatchSemaphoreSection(uint32_t value);
size_t getSizeSemaphoreSection();
void dispatchSemaphoreSection(uint32_t value, bool firstSubmission);
size_t getSizeSemaphoreSection(bool firstSubmission);
void dispatchRelaxedOrderingSchedulerSection(uint32_t value);
void dispatchStartSection(uint64_t gpuStartAddress);
size_t getSizeStartSection();
@@ -122,6 +124,8 @@ class DirectSubmissionHw {
void dispatchTaskStoreSection(uint64_t taskStartSectionVa);
MOCKABLE_VIRTUAL void preinitializeTaskStoreSection();
void initRelaxedOrderingRegisters();
void setReturnAddress(void *returnCmd, uint64_t returnAddress);
void *dispatchWorkloadSection(BatchBuffer &batchBuffer);

View File

@@ -80,6 +80,145 @@ DirectSubmissionHw<GfxFamily, Dispatcher>::DirectSubmissionHw(const DirectSubmis
relaxedOrderingEnabled = (DebugManager.flags.DirectSubmissionRelaxedOrdering.get() == 1);
}
// Emits the relaxed-ordering scheduler into ringCommandStream as eight consecutive sections.
//
// Every jump target below is computed as schedulerStartAddress plus a compile-time offset taken
// from RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>. The number and order of
// commands emitted in each section therefore has to stay in sync with the sizes declared in that
// struct; adding or removing a command here without updating it breaks the jump targets.
//
// value - compared against the dword at semaphoreGpuVa in sections 6 and 7.
//
// NOTE(review): register roles (R1 = number of deferred tasks, R2 = current task index,
// R5 = drain request flag) are inferred from how the registers are used here - confirm.
template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingSchedulerSection(uint32_t value) {
// getSpace(0) is used only to obtain the current write position (presumably without reserving space).
uint64_t schedulerStartAddress = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress();
// 1. Init section
{
EncodeMiPredicate<GfxFamily>::encode(ringCommandStream, MiPredicateType::Disable);
// Conditional jump to the semaphore section (section 7); condition: R1 compared with 0, Equal.
EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(ringCommandStream,
schedulerStartAddress + RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>::semaphoreSectionStart,
CS_GPR_R1, 0, CompareOperation::Equal, false);
// R2 = 0 (both dwords).
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R2, 0, true);
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R2 + 4, 0, true);
// R3 = 64-bit GPU address of the remove task section (section 3), written as low/high dwords.
uint64_t removeTaskVa = schedulerStartAddress + RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>::removeTaskSectionStart;
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R3, static_cast<uint32_t>(removeTaskVa & 0xFFFF'FFFFULL), true);
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R3 + 4, static_cast<uint32_t>(removeTaskVa >> 32), true);
// R4 = 64-bit GPU address of the list loop check section (section 4), written as low/high dwords.
uint64_t walkersLoopConditionCheckVa = schedulerStartAddress + RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>::tasksListLoopCheckSectionStart;
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R4, static_cast<uint32_t>(walkersLoopConditionCheckVa & 0xFFFF'FFFFULL), true);
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R4 + 4, static_cast<uint32_t>(walkersLoopConditionCheckVa >> 32), true);
}
// 2. Dispatch task section (loop start)
{
EncodeMiPredicate<GfxFamily>::encode(ringCommandStream, MiPredicateType::Disable);
// R6 = 8: second operand of the SHL below.
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R6, 8, true);
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R6 + 4, 0, true);
// R8 = GPU address of the deferred tasks list.
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R8, static_cast<uint32_t>(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true);
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R8 + 4, static_cast<uint32_t>(deferredTasksListGpuVa >> 32), true);
// ALU program (10 instructions, must match EncodeAluHelper<GfxFamily, 10> in the size struct):
//   R7 = SHL(R2, R6); ACCU = R7 + R8; R6 = ACCU; R0 = load-indirect via ACCU; read fence.
// NOTE(review): presumably this turns the index in R2 into the address of a list entry and
// loads that entry into R0 - confirm against the MI_MATH documentation.
EncodeAluHelper<GfxFamily, 10> aluHelper;
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_2);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_6);
aluHelper.setNextAlu(AluRegisters::OPCODE_SHL);
aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_7, AluRegisters::R_ACCU);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_7);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_8);
aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_6, AluRegisters::R_ACCU);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOADIND, AluRegisters::R_0, AluRegisters::R_ACCU);
aluHelper.setNextAlu(AluRegisters::OPCODE_FENCE_RD);
aluHelper.copyToCmdStream(ringCommandStream);
// Batch buffer start programmed with address 0.
// NOTE(review): the (false, true, false) flags presumably select an indirect start whose target
// comes from a register (likely R0 loaded above) - confirm against programBatchBufferStart.
EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&ringCommandStream, 0, false, true, false);
}
// 3. Remove task section (its address is stored in R3 by the init section)
{
EncodeMiPredicate<GfxFamily>::encode(ringCommandStream, MiPredicateType::Disable);
EncodeMathMMIO<GfxFamily>::encodeDecrement(ringCommandStream, AluRegisters::R_1);
EncodeMathMMIO<GfxFamily>::encodeDecrement(ringCommandStream, AluRegisters::R_2);
// Conditional jump to the semaphore section; condition: R1 compared with 0, Equal.
EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(ringCommandStream,
schedulerStartAddress + RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>::semaphoreSectionStart,
CS_GPR_R1, 0, CompareOperation::Equal, false);
// R7 = 8 (SHL operand), R8 = GPU address of the deferred tasks list.
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R7, 8, true);
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R7 + 4, 0, true);
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R8, static_cast<uint32_t>(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true);
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R8 + 4, static_cast<uint32_t>(deferredTasksListGpuVa >> 32), true);
// ALU program (14 instructions, must match EncodeAluHelper<GfxFamily, 14> in the size struct):
//   R7 = SHL(R1, R7); ACCU = R7 + R8; R7 = load-indirect via ACCU; read fence;
//   ACCU = R6 + 0; store-indirect of ACCU/R7; write fence.
// NOTE(review): presumably this copies the last list entry into the slot whose address was left
// in R6 by section 2, compacting the list after a removal - confirm.
EncodeAluHelper<GfxFamily, 14> aluHelper;
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_1);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_7);
aluHelper.setNextAlu(AluRegisters::OPCODE_SHL);
aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_7, AluRegisters::R_ACCU);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_7);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_8);
aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOADIND, AluRegisters::R_7, AluRegisters::R_ACCU);
aluHelper.setNextAlu(AluRegisters::OPCODE_FENCE_RD);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_6);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD0, AluRegisters::R_SRCB, AluRegisters::OPCODE_NONE);
aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
aluHelper.setNextAlu(AluRegisters::OPCODE_STOREIND, AluRegisters::R_ACCU, AluRegisters::R_7);
aluHelper.setNextAlu(AluRegisters::OPCODE_FENCE_WR);
aluHelper.copyToCmdStream(ringCommandStream);
}
// 4. List loop check section (its address is stored in R4 by the init section)
{
EncodeMiPredicate<GfxFamily>::encode(ringCommandStream, MiPredicateType::Disable);
EncodeMathMMIO<GfxFamily>::encodeIncrement(ringCommandStream, AluRegisters::R_2);
// Conditional jump back to the loop start (section 2); condition: R1 compared with R2, NotEqual.
EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalRegRegBatchBufferStart(ringCommandStream,
schedulerStartAddress + RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>::loopStartSectionStart,
AluRegisters::R_1, AluRegisters::R_2, CompareOperation::NotEqual, false);
// Jump not taken: reset R2 to 0 before the next pass.
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R2, 0, true);
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R2 + 4, 0, true);
}
// 5. Drain request section
{
*ringCommandStream.getSpaceForCmd<typename GfxFamily::MI_ARB_CHECK>() = GfxFamily::cmdInitArbCheck;
// Conditional jump back to the loop start; condition: R5 compared with 1, Equal.
EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(ringCommandStream,
schedulerStartAddress + RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>::loopStartSectionStart,
CS_GPR_R5, 1, CompareOperation::Equal, false);
}
// 6. Scheduler loop check section
{
// Conditional jump to the end section (section 8), skipping the semaphore wait;
// condition: memory at semaphoreGpuVa compared with value, GreaterOrEqual.
EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(ringCommandStream, schedulerStartAddress + RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>::endSectionStart,
semaphoreGpuVa, value, CompareOperation::GreaterOrEqual, false);
// Otherwise jump back to the loop start (address passed directly, all flags false).
EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&ringCommandStream, schedulerStartAddress + RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>::loopStartSectionStart, false, false, false);
}
// 7. Semaphore section (jump target of the R1 == 0 checks in sections 1 and 3)
{
// NOTE(review): the MI_SEMAPHORE_WAIT alias is not referenced within this scope.
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
EncodeMiPredicate<GfxFamily>::encode(ringCommandStream, MiPredicateType::Disable);
// Semaphore wait on semaphoreGpuVa with value, using the same compare mode as the
// non-relaxed path in dispatchSemaphoreSection.
EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(ringCommandStream,
semaphoreGpuVa,
value,
COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}
// 8. End section
{
EncodeMiPredicate<GfxFamily>::encode(ringCommandStream, MiPredicateType::Disable);
// Clear the low dword of R5 (the register tested by the drain request section).
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5, 0, true);
}
}
// Out-of-line destructor definition, explicitly defaulted (no custom teardown logic here).
template <typename GfxFamily, typename Dispatcher>
DirectSubmissionHw<GfxFamily, Dispatcher>::~DirectSubmissionHw() = default;
@@ -213,7 +352,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
initDiagnostic(submitOnInit);
if (ret && submitOnInit) {
size_t startBufferSize = Dispatcher::getSizePreemption() +
getSizeSemaphoreSection();
getSizeSemaphoreSection(true);
Dispatcher::dispatchPreemption(ringCommandStream);
if (this->partitionedMode) {
@@ -230,13 +369,17 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
}
if (this->relaxedOrderingEnabled) {
preinitializeTaskStoreSection();
initRelaxedOrderingRegisters();
startBufferSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
this->relaxedOrderingInitialized = true;
}
if (workloadMode == 1) {
dispatchDiagnosticModeSection();
startBufferSize += getDiagnosticModeSection();
}
dispatchSemaphoreSection(currentQueueWorkCount);
dispatchSemaphoreSection(currentQueueWorkCount, true);
ringStart = submit(ringCommandStream.getGraphicsAllocation()->getGpuAddress(), startBufferSize);
performDiagnosticMode();
@@ -251,13 +394,16 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
return true;
}
size_t startSize = getSizeSemaphoreSection();
size_t startSize = getSizeSemaphoreSection(true);
if (!this->partitionConfigSet) {
startSize += getSizePartitionRegisterConfigurationSection();
}
if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
startSize += getSizeSystemMemoryFenceAddress();
}
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
startSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
}
size_t requiredSize = startSize + getSizeDispatch() + getSizeEnd();
if (ringCommandStream.getAvailableSpace() < requiredSize) {
@@ -277,11 +423,13 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
preinitializeTaskStoreSection();
initRelaxedOrderingRegisters();
this->relaxedOrderingInitialized = true;
}
currentQueueWorkCount++;
dispatchSemaphoreSection(currentQueueWorkCount);
dispatchSemaphoreSection(currentQueueWorkCount, true);
ringStart = submit(gpuStartVa, startSize);
@@ -319,15 +467,20 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer() {
}
template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(uint32_t value) {
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(uint32_t value, bool firstSubmission) {
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
dispatchDisablePrefetcher(true);
EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(ringCommandStream,
semaphoreGpuVa,
value,
COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
if (this->relaxedOrderingEnabled && !firstSubmission) {
dispatchRelaxedOrderingSchedulerSection(value);
} else {
EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(ringCommandStream,
semaphoreGpuVa,
value,
COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}
if (miMemFenceRequired) {
MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronizationForDirectSubmission(ringCommandStream, this->gpuVaForAdditionalSynchronizationWA, true, *hwInfo);
@@ -338,8 +491,9 @@ inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(
}
template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSemaphoreSection() {
size_t semaphoreSize = EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSemaphoreSection(bool firstSubmission) {
size_t semaphoreSize = (this->relaxedOrderingEnabled && !firstSubmission) ? RelaxedOrderingHelper::SchedulerSizeAndOffsetSection<GfxFamily>::totalSize
: EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
semaphoreSize += getSizePrefetchMitigation();
if (isDisablePrefetcherRequired) {
@@ -405,7 +559,7 @@ inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::getCommandBufferPosit
template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch() {
size_t size = getSizeSemaphoreSection();
size_t size = getSizeSemaphoreSection(false);
if (workloadMode == 0) {
size += getSizeStartSection();
} else if (workloadMode == 1) {
@@ -475,10 +629,18 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
}
dispatchSemaphoreSection(currentQueueWorkCount + 1);
dispatchSemaphoreSection(currentQueueWorkCount + 1, false);
return currentPosition;
}
// Writes zero to both dwords of CS_GPR_R1 and CS_GPR_R5 through MI_LOAD_REGISTER_IMM commands
// placed in ringCommandStream. Emits four commands in total, which is the amount accounted for
// by RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>().
template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::initRelaxedOrderingRegisters() {
    auto *stream = &ringCommandStream;

    // Clears one 64-bit GPR: low dword first, then the dword 4 bytes above it.
    const auto clearGpr = [stream](auto lowDwordOffset) {
        LriHelper<GfxFamily>::program(stream, lowDwordOffset, 0, true);
        LriHelper<GfxFamily>::program(stream, lowDwordOffset + 4, 0, true);
    };

    clearGpr(CS_GPR_R1);
    clearGpr(CS_GPR_R5);
}
template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::preinitializeTaskStoreSection() {
preinitializedTaskStoreSection = std::make_unique<uint8_t[]>(RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());

View File

@@ -21,5 +21,47 @@ constexpr size_t getSizeTaskStoreSection() {
EncodeMiPredicate<GfxFamily>::getCmdSize());
}
// Size in bytes of the relaxed-ordering register initialization:
// four MI_LOAD_REGISTER_IMM commands.
template <typename GfxFamily>
constexpr size_t getSizeRegistersInit() {
    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
    constexpr size_t registerWritesCount = 4;
    return registerWritesCount * sizeof(MI_LOAD_REGISTER_IMM);
}
// Compile-time byte layout of the relaxed-ordering scheduler emitted by
// DirectSubmissionHw::dispatchRelaxedOrderingSchedulerSection.
// Each *SectionStart is a byte offset from the scheduler start; each *SectionSize is the summed
// size of the commands in that section, listed below in the order they are emitted.
// The init section has no explicit start constant: it begins at offset 0.
template <typename GfxFamily>
struct SchedulerSizeAndOffsetSection {
    using MI_MATH = typename GfxFamily::MI_MATH;
    using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

    // 1. Init section
    static constexpr uint64_t initSectionSize = EncodeMiPredicate<GfxFamily>::getCmdSize() +
                                                EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart() +
                                                (6 * sizeof(MI_LOAD_REGISTER_IMM));

    // 2. Dispatch task section (loop start)
    static constexpr uint64_t loopStartSectionStart = initSectionSize;
    static constexpr uint64_t loopStartSectionSize = EncodeMiPredicate<GfxFamily>::getCmdSize() +
                                                     (4 * sizeof(MI_LOAD_REGISTER_IMM)) +
                                                     EncodeAluHelper<GfxFamily, 10>::getCmdsSize() +
                                                     sizeof(MI_BATCH_BUFFER_START);

    // 3. Remove task section
    static constexpr uint64_t removeTaskSectionStart = loopStartSectionStart + loopStartSectionSize;
    static constexpr uint64_t removeStartSectionSize = EncodeMiPredicate<GfxFamily>::getCmdSize() +
                                                       (2 * EncodeMathMMIO<GfxFamily>::getCmdSizeForIncrementOrDecrement()) +
                                                       EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart() +
                                                       (4 * sizeof(MI_LOAD_REGISTER_IMM)) +
                                                       EncodeAluHelper<GfxFamily, 14>::getCmdsSize();

    // 4. List loop check section
    static constexpr uint64_t tasksListLoopCheckSectionStart = removeTaskSectionStart + removeStartSectionSize;
    static constexpr uint64_t tasksListLoopCheckSectionSize = EncodeMiPredicate<GfxFamily>::getCmdSize() +
                                                              EncodeMathMMIO<GfxFamily>::getCmdSizeForIncrementOrDecrement() +
                                                              EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalRegRegBatchBufferStart() +
                                                              (2 * sizeof(MI_LOAD_REGISTER_IMM));

    // 5. Drain request section
    static constexpr uint64_t drainRequestSectionStart = tasksListLoopCheckSectionStart + tasksListLoopCheckSectionSize;
    static constexpr uint64_t drainRequestSectionSize = sizeof(typename GfxFamily::MI_ARB_CHECK) +
                                                        EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart();

    // 6. Scheduler loop check section
    static constexpr uint64_t schedulerLoopCheckSectionStart = drainRequestSectionStart + drainRequestSectionSize;
    static constexpr uint64_t schedulerLoopCheckSectionSize = EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataMemBatchBufferStart() +
                                                              sizeof(MI_BATCH_BUFFER_START);

    // 7. Semaphore section
    static constexpr uint64_t semaphoreSectionStart = schedulerLoopCheckSectionStart + schedulerLoopCheckSectionSize;
    static constexpr uint64_t semaphoreSectionSize = EncodeMiPredicate<GfxFamily>::getCmdSize() +
                                                     EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();

    // 8. End section
    static constexpr uint64_t endSectionStart = semaphoreSectionStart + semaphoreSectionSize;
    static constexpr uint64_t endSectionSize = EncodeMiPredicate<GfxFamily>::getCmdSize() +
                                               sizeof(MI_LOAD_REGISTER_IMM);

    // Whole scheduler, first byte of the init section through the last byte of the end section.
    static constexpr uint64_t totalSize = endSectionStart + endSectionSize;
};
} // namespace RelaxedOrderingHelper
} // namespace NEO