Introduce initial implementation of DirectSubmission relaxed ordering mode.

Initial implementation of task store section

Related-To: NEO-7458

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2022-11-16 17:24:04 +00:00
committed by Compute-Runtime-Automation
parent 8927399cce
commit 89b96e5e8f
16 changed files with 393 additions and 5 deletions

View File

@@ -23,6 +23,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
using BaseClass::currentRingBuffer;
using BaseClass::dcFlushRequired;
using BaseClass::deallocateResources;
using BaseClass::deferredTasksListAllocation;
using BaseClass::diagnostic;
using BaseClass::DirectSubmissionHw;
using BaseClass::disableCacheFlush;
@@ -53,6 +54,8 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
using BaseClass::partitionedMode;
using BaseClass::performDiagnosticMode;
using BaseClass::postSyncOffset;
using BaseClass::preinitializedTaskStoreSection;
using BaseClass::relaxedOrderingInitialized;
using BaseClass::reserved;
using BaseClass::ringBuffers;
using BaseClass::ringCommandStream;
@@ -84,6 +87,11 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
return allocateOsResourcesReturn;
}
// Test hook: counts every invocation, then forwards to the base class implementation.
void preinitializeTaskStoreSection() override {
    ++preinitializeTaskStoreSectionCalled;
    BaseClass::preinitializeTaskStoreSection();
}
bool makeResourcesResident(DirectSubmissionAllocations &allocations) override {
makeResourcesResidentVectorSize = static_cast<uint32_t>(allocations.size());
if (callBaseResident) {
@@ -139,6 +147,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
uint32_t submitCount = 0u;
uint32_t handleResidencyCount = 0u;
uint32_t disabledDiagnosticCalled = 0u;
// Incremented by the preinitializeTaskStoreSection() override each time it is invoked.
uint32_t preinitializeTaskStoreSectionCalled = 0;
uint32_t makeResourcesResidentVectorSize = 0u;
bool allocateOsResourcesReturn = true;
bool submitReturn = true;

View File

@@ -486,4 +486,5 @@ AdjustThreadGroupDispatchSize = -1
ForceNonblockingExecbufferCalls = -1
UseHighAlignmentForHeapExtended = -1
ForceAutoGrfCompilationMode = -1
ForceComputeWalkerPostSyncFlush = -1
ForceComputeWalkerPostSyncFlush = -1
DirectSubmissionRelaxedOrdering = -1

View File

@@ -11,6 +11,7 @@
#include "shared/source/direct_submission/direct_submission_hw.h"
#include "shared/source/direct_submission/dispatchers/render_dispatcher.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/utilities/cpuintrinsics.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
@@ -901,4 +902,227 @@ HWTEST_F(DirectSubmissionDispatchBufferTest, givenDebugFlagSetWhenStoppingRingbu
EXPECT_EQ(initialCounterValue + expectedCount, CpuIntrinsicsTests::sfenceCounter);
}
}
// Fixture for the relaxed ordering tests: sets the DirectSubmissionRelaxedOrdering
// debug flag to 1 and then runs the SetUp of DirectSubmissionDispatchBufferTest.
struct DirectSubmissionRelaxedOrderingTests : public DirectSubmissionDispatchBufferTest {
    // NOTE(review): presumably reverts the debug flag changed in SetUp() once the
    // fixture is destroyed - confirm against DebugManagerStateRestore.
    DebugManagerStateRestore restore;

    void SetUp() override {
        // The flag is set before the base fixture's SetUp() runs.
        DebugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
        DirectSubmissionDispatchBufferTest::SetUp();
    }
};
// Verifies that initializing direct submission creates the deferred tasks list allocation
// with the DEFERRED_TASKS_LIST type, and that it is the last allocation captured by the
// mock memory operations interface for make-resident.
HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenAllocatingResourcesThenCreateDeferredTasksAllocation) {
    using Dispatcher = RenderDispatcher<FamilyType>;

    // NOTE(review): reset() presumably transfers ownership of the mock to the root device
    // environment; the raw pointer is kept only to inspect the captured allocations.
    auto mockMemoryOperations = new MockMemoryOperations();
    mockMemoryOperations->captureGfxAllocationsForMakeResident = true;
    pDevice->getRootDeviceEnvironmentRef().memoryOperationsInterface.reset(mockMemoryOperations);

    MockDirectSubmissionHw<FamilyType, Dispatcher> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);
    directSubmission.callBaseResident = true;
    directSubmission.initialize(false, false);

    // The null check is fatal and comes first: the next expectation dereferences the pointer.
    ASSERT_NE(nullptr, directSubmission.deferredTasksListAllocation);
    EXPECT_EQ(AllocationType::DEFERRED_TASKS_LIST, directSubmission.deferredTasksListAllocation->getAllocationType());

    // back() must not be called on an empty container.
    ASSERT_FALSE(mockMemoryOperations->gfxAllocationsForMakeResident.empty());
    EXPECT_EQ(directSubmission.deferredTasksListAllocation, mockMemoryOperations->gfxAllocationsForMakeResident.back());
}
// Verifies when the task store section gets preinitialized: not by initialize(false, false)
// alone, once by initialize(true, false), and once by the first startRingBuffer() if it has
// not happened before. Further startRingBuffer() calls must not repeat it.
HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenInitializingThenPreinitializeTaskStoreSection) {
    using DispatcherType = RenderDispatcher<FamilyType>;
    using MockSubmission = MockDirectSubmissionHw<FamilyType, DispatcherType>;

    {
        // initialize(false, false) only: nothing is preinitialized.
        MockSubmission notStarted(*pDevice->getDefaultEngine().commandStreamReceiver);
        notStarted.initialize(false, false);

        EXPECT_EQ(0u, notStarted.preinitializeTaskStoreSectionCalled);
        EXPECT_FALSE(notStarted.relaxedOrderingInitialized);
        EXPECT_EQ(nullptr, notStarted.preinitializedTaskStoreSection.get());
    }

    {
        // initialize(true, false): preinitialized exactly once, also after startRingBuffer().
        MockSubmission startedOnInit(*pDevice->getDefaultEngine().commandStreamReceiver);
        startedOnInit.initialize(true, false);

        EXPECT_EQ(1u, startedOnInit.preinitializeTaskStoreSectionCalled);
        EXPECT_TRUE(startedOnInit.relaxedOrderingInitialized);
        EXPECT_NE(nullptr, startedOnInit.preinitializedTaskStoreSection.get());

        startedOnInit.startRingBuffer();
        EXPECT_EQ(1u, startedOnInit.preinitializeTaskStoreSectionCalled);
    }

    {
        // initialize(false, false) followed by startRingBuffer(): the first start
        // preinitializes the section, the second start does not repeat it.
        MockSubmission startedLater(*pDevice->getDefaultEngine().commandStreamReceiver);
        startedLater.initialize(false, false);
        EXPECT_EQ(0u, startedLater.preinitializeTaskStoreSectionCalled);

        startedLater.startRingBuffer();
        EXPECT_EQ(1u, startedLater.preinitializeTaskStoreSectionCalled);
        EXPECT_TRUE(startedLater.relaxedOrderingInitialized);
        EXPECT_NE(nullptr, startedLater.preinitializedTaskStoreSection.get());

        startedLater.startRingBuffer();
        EXPECT_EQ(1u, startedLater.preinitializeTaskStoreSectionCalled);
    }
}
// Verifies, command by command, the task store section found in the ring buffer after a
// command buffer is dispatched. The section is looked up at the ring position recorded before
// the dispatch plus the size of the start section.
HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenDispatchingWorkThenDispatchTaskStoreSection) {
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
    using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE;
    using MI_MATH = typename FamilyType::MI_MATH;
    using Dispatcher = RenderDispatcher<FamilyType>;

    MockDirectSubmissionHw<FamilyType, Dispatcher> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);
    directSubmission.initialize(true, false);

    // Fail cleanly here instead of dereferencing a null allocation further down.
    ASSERT_NE(nullptr, directSubmission.deferredTasksListAllocation);

    // Expected location of the task store section inside the ring buffer.
    auto offset = directSubmission.ringCommandStream.getUsed() + directSubmission.getSizeStartSection();

    FlushStampTracker flushStamp(true);
    directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp);

    auto taskStoreSection = ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset);

    // On families using MI_SET_PREDICATE the section starts with a predicate-disable command.
    if constexpr (FamilyType::isUsingMiSetPredicate) {
        using MI_SET_PREDICATE = typename FamilyType::MI_SET_PREDICATE;
        using PREDICATE_ENABLE = typename MI_SET_PREDICATE::PREDICATE_ENABLE;

        auto miSetPredicate = reinterpret_cast<MI_SET_PREDICATE *>(taskStoreSection);
        EXPECT_EQ(PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE, miSetPredicate->getPredicateEnable());

        taskStoreSection = ptrOffset(taskStoreSection, sizeof(MI_SET_PREDICATE));
    }

    uint64_t deferredTasksVa = directSubmission.deferredTasksListAllocation->getGpuAddress();

    // Six MI_LOAD_REGISTER_IMM commands follow: each 64-bit GPR is written as a low dword
    // (register offset) and a high dword (register offset + 4).
    auto lriCmd = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(taskStoreSection);

    // R6 = GPU address of the deferred tasks list allocation.
    EXPECT_EQ(CS_GPR_R6, lriCmd->getRegisterOffset());
    EXPECT_EQ(static_cast<uint32_t>(deferredTasksVa & 0xFFFF'FFFFULL), lriCmd->getDataDword());

    lriCmd++;
    EXPECT_EQ(CS_GPR_R6 + 4, lriCmd->getRegisterOffset());
    EXPECT_EQ(static_cast<uint32_t>(deferredTasksVa >> 32), lriCmd->getDataDword());

    // R7 = 0.
    lriCmd++;
    EXPECT_EQ(CS_GPR_R7, lriCmd->getRegisterOffset());
    EXPECT_EQ(0u, lriCmd->getDataDword());

    lriCmd++;
    EXPECT_EQ(CS_GPR_R7 + 4, lriCmd->getRegisterOffset());
    EXPECT_EQ(0u, lriCmd->getDataDword());

    // R8 = 8.
    lriCmd++;
    EXPECT_EQ(CS_GPR_R8, lriCmd->getRegisterOffset());
    EXPECT_EQ(8u, lriCmd->getDataDword());

    lriCmd++;
    EXPECT_EQ(CS_GPR_R8 + 4, lriCmd->getRegisterOffset());
    EXPECT_EQ(0u, lriCmd->getDataDword());

    // First MI_MATH header, followed by nine inline ALU instructions checked in order.
    auto miMathCmd = reinterpret_cast<MI_MATH *>(++lriCmd);
    EXPECT_EQ(8u, miMathCmd->DW0.BitField.DwordLength);

    auto miAluCmd = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(++miMathCmd);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_SRCA), miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_1), miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_SRCB), miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_8), miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_SHL), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_STORE), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_8), miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_ACCU), miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_SRCA), miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_8), miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_SRCB), miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_6), miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_ADD), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_STOREIND), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_ACCU), miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_7), miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_FENCE_WR), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand2);

    // increment
    // R7 = 1, then a second MI_MATH with four ALU instructions: load R1 and R7, add,
    // store the accumulator back to R1.
    lriCmd = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(++miAluCmd);
    EXPECT_EQ(CS_GPR_R7, lriCmd->getRegisterOffset());
    EXPECT_EQ(1u, lriCmd->getDataDword());

    lriCmd++;
    EXPECT_EQ(CS_GPR_R7 + 4, lriCmd->getRegisterOffset());
    EXPECT_EQ(0u, lriCmd->getDataDword());

    miMathCmd = reinterpret_cast<MI_MATH *>(++lriCmd);
    EXPECT_EQ(3u, miMathCmd->DW0.BitField.DwordLength);

    miAluCmd = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(++miMathCmd);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_SRCA), miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_1), miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_SRCB), miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_7), miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_ADD), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand2);

    miAluCmd++;
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::OPCODE_STORE), miAluCmd->DW0.BitField.ALUOpcode);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_1), miAluCmd->DW0.BitField.Operand1);
    EXPECT_EQ(static_cast<uint32_t>(AluRegisters::R_ACCU), miAluCmd->DW0.BitField.Operand2);
}
// Verifies that dispatching a command buffer switches to a different ring buffer allocation
// when the current ring has only getSizeDispatch() + getSizeEnd() +
// getSizeSwitchRingBufferSection() bytes left.
// NOTE(review): per the test name this remainder is expected to be too small once the task
// store section is accounted for - confirm against the dispatch size calculation.
HWTEST_F(DirectSubmissionRelaxedOrderingTests, givenNotEnoughSpaceForTaskStoreSectionWhenDispatchingThenSwitchRingBuffers) {
    using Dispatcher = RenderDispatcher<FamilyType>;

    MockDirectSubmissionHw<FamilyType, Dispatcher> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);
    directSubmission.initialize(true, false);

    // Consume the ring so that exactly the sum below remains available.
    auto sizeToConsume = directSubmission.ringCommandStream.getAvailableSpace() -
                         (directSubmission.getSizeDispatch() + directSubmission.getSizeEnd() + directSubmission.getSizeSwitchRingBufferSection());
    directSubmission.ringCommandStream.getSpace(sizeToConsume);

    auto oldAllocation = directSubmission.ringCommandStream.getGraphicsAllocation();

    FlushStampTracker flushStamp(true);
    directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp);

    // A different backing allocation means the ring buffer was switched.
    EXPECT_NE(oldAllocation, directSubmission.ringCommandStream.getGraphicsAllocation());
}

View File

@@ -104,7 +104,8 @@ AllocationTypeTagTestCase allocationTypeTagValues[static_cast<int>(AllocationTyp
{AllocationType::DEBUG_MODULE_AREA, "DBMDLARE"},
{AllocationType::UNIFIED_SHARED_MEMORY, "USHRDMEM"},
{AllocationType::GPU_TIMESTAMP_DEVICE_BUFFER, "GPUTSDBF"},
{AllocationType::SW_TAG_BUFFER, "SWTAGBF"}};
{AllocationType::SW_TAG_BUFFER, "SWTAGBF"},
{AllocationType::DEFERRED_TASKS_LIST, "TSKLIST"}};
class AllocationTypeTagString : public ::testing::TestWithParam<AllocationTypeTagTestCase> {};
TEST_P(AllocationTypeTagString, givenGraphicsAllocationTypeWhenCopyTagToStorageInfoThenCorrectTagIsReturned) {

View File

@@ -582,6 +582,14 @@ TEST(MemoryManagerTest, givenPreemptionTypeWhenGetAllocationDataIsCalledThen48Bi
EXPECT_TRUE(allocData.flags.resource48Bit);
}
// Verifies that getAllocationData() sets the resource48Bit flag for a
// DEFERRED_TASKS_LIST allocation.
TEST(MemoryManagerTest, givenDeferredTasksListTypeWhenGetAllocationDataIsCalledThen48BitResourceIsTrue) {
    AllocationData allocationData;
    MockMemoryManager memoryManager;
    AllocationProperties deferredTasksProperties{mockRootDeviceIndex, 1, AllocationType::DEFERRED_TASKS_LIST, mockDeviceBitfield};

    memoryManager.getAllocationData(allocationData,
                                    deferredTasksProperties,
                                    nullptr,
                                    memoryManager.createStorageInfoFromProperties(deferredTasksProperties));

    EXPECT_TRUE(allocationData.flags.resource48Bit);
}
TEST(MemoryManagerTest, givenSharedContextImageTypeWhenGetAllocationDataIsCalledThenSystemMemoryIsRequested) {
AllocationData allocData;
MockMemoryManager mockMemoryManager;
@@ -1094,6 +1102,7 @@ static const AllocationType allocationHaveToBeForcedTo48Bit[] = {
AllocationType::TIMESTAMP_PACKET_TAG_BUFFER,
AllocationType::RING_BUFFER,
AllocationType::SEMAPHORE_BUFFER,
AllocationType::DEFERRED_TASKS_LIST,
};
static const AllocationType allocationHaveNotToBeForcedTo48Bit[] = {

View File

@@ -212,6 +212,23 @@ TEST_F(MultiDeviceStorageInfoTest, givenSingleTileCsrWhenCreatingStorageInfoForP
EXPECT_EQ(singleTileMask, storageInfo.pageTablesVisibility);
}
// Verifies the storage info created for a DEFERRED_TASKS_LIST allocation in the multi-tile
// case: no page table cloning, tile-instanced, all tiles set in both masks.
// NOTE(review): the fifth constructor argument (true) presumably marks the allocation as
// multi-context capable - confirm against the AllocationProperties constructor.
TEST_F(MultiDeviceStorageInfoTest, givenMultiTileCsrWhenCreatingStorageInfoForDeferredTasksListAllocationThenAllMemoryBankAreOnAndPageTableClonningIsNotRequired) {
    AllocationProperties multiTileProperties{mockRootDeviceIndex, false, 0u, AllocationType::DEFERRED_TASKS_LIST, true, false, singleTileMask};

    auto info = memoryManager->createStorageInfoFromProperties(multiTileProperties);

    EXPECT_FALSE(info.cloningOfPageTables);
    EXPECT_TRUE(info.tileInstanced);
    EXPECT_EQ(allTilesMask, info.memoryBanks);
    EXPECT_EQ(allTilesMask, info.pageTablesVisibility);
}
// Verifies the storage info created for a DEFERRED_TASKS_LIST allocation in the single-tile
// case: page table cloning is on and both masks contain only the single tile.
// NOTE(review): the fifth constructor argument (false) presumably marks the allocation as
// not multi-context capable - confirm against the AllocationProperties constructor.
TEST_F(MultiDeviceStorageInfoTest, givenSingleTileCsrWhenCreatingStorageInfoForDeferredTasksListAllocationThenSingleMemoryBankIsOnAndPageTableClonningIsRequired) {
    AllocationProperties singleTileProperties{mockRootDeviceIndex, false, 0u, AllocationType::DEFERRED_TASKS_LIST, false, false, singleTileMask};

    auto info = memoryManager->createStorageInfoFromProperties(singleTileProperties);

    EXPECT_TRUE(info.cloningOfPageTables);
    EXPECT_EQ(singleTileMask, info.memoryBanks);
    EXPECT_EQ(singleTileMask, info.pageTablesVisibility);
}
TEST_F(MultiDeviceStorageInfoTest, whenCreatingStorageInfoForWorkPartitionSurfaceThenAllMemoryBankAreOnAndPageTableClonningIsNotRequired) {
AllocationProperties properties{mockRootDeviceIndex, false, 0u, AllocationType::WORK_PARTITION_SURFACE, true, false, singleTileMask};
auto storageInfo = memoryManager->createStorageInfoFromProperties(properties);