Fix direct submission wait on a multi-tile device when using a single-tile context

Related-To: NEO-6244

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2021-10-26 11:53:01 +00:00
committed by Compute-Runtime-Automation
parent fe432abbb3
commit 60805cdbcf
6 changed files with 134 additions and 7 deletions

View File

@ -154,5 +154,6 @@ class DirectSubmissionHw {
bool disableCacheFlush = false;
bool disableMonitorFence = false;
bool partitionedMode = false;
bool partitionConfigSet = true;
};
} // namespace NEO

View File

@ -144,12 +144,12 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit) {
if (ret && submitOnInit) {
size_t startBufferSize = Dispatcher::getSizePreemption() +
getSizeSemaphoreSection();
if (this->partitionedMode) {
startBufferSize += EncodeSetMMIO<GfxFamily>::sizeMEM;
startBufferSize += EncodeSetMMIO<GfxFamily>::sizeIMM;
}
Dispatcher::dispatchPreemption(ringCommandStream);
if (this->partitionedMode) {
startBufferSize += (EncodeSetMMIO<GfxFamily>::sizeMEM +
EncodeSetMMIO<GfxFamily>::sizeIMM);
EncodeSetMMIO<GfxFamily>::encodeMEM(ringCommandStream,
PartitionRegisters<GfxFamily>::wparidCCSOffset,
this->workPartitionAllocation->getGpuAddress());
@ -157,6 +157,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit) {
PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
CommonConstants::partitionAddressOffset,
true);
this->partitionConfigSet = true;
}
if (workloadMode == 1) {
dispatchDiagnosticModeSection();
@ -178,12 +179,27 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
}
size_t startSize = getSizeSemaphoreSection();
if (!this->partitionConfigSet) {
startSize += (EncodeSetMMIO<GfxFamily>::sizeMEM +
EncodeSetMMIO<GfxFamily>::sizeIMM);
}
size_t requiredSize = startSize + getSizeDispatch() + getSizeEnd();
if (ringCommandStream.getAvailableSpace() < requiredSize) {
switchRingBuffers();
}
uint64_t gpuStartVa = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
if (!this->partitionConfigSet) {
EncodeSetMMIO<GfxFamily>::encodeMEM(ringCommandStream,
PartitionRegisters<GfxFamily>::wparidCCSOffset,
this->workPartitionAllocation->getGpuAddress());
EncodeSetMMIO<GfxFamily>::encodeIMM(ringCommandStream,
PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
CommonConstants::partitionAddressOffset,
true);
this->partitionConfigSet = true;
}
currentQueueWorkCount++;
dispatchSemaphoreSection(currentQueueWorkCount);

View File

@ -29,13 +29,17 @@ DrmDirectSubmission<GfxFamily, Dispatcher>::DrmDirectSubmission(Device &device,
if (DebugManager.flags.DirectSubmissionDisableMonitorFence.get() != -1) {
this->disableMonitorFence = DebugManager.flags.DirectSubmissionDisableMonitorFence.get();
}
auto subDevices = device.getDeviceBitfield();
auto osContextLinux = static_cast<OsContextLinux *>(&this->osContext);
auto subDevices = osContextLinux->getDeviceBitfield();
bool dispatcherSupport = Dispatcher::isMultiTileSynchronizationSupported();
if (ImplicitScalingHelper::isImplicitScalingEnabled(subDevices, true) && dispatcherSupport) {
this->activeTiles = static_cast<uint32_t>(subDevices.count());
}
this->partitionedMode = this->activeTiles > 1u;
auto osContextLinux = static_cast<OsContextLinux *>(&this->osContext);
this->partitionConfigSet = !this->partitionedMode;
osContextLinux->getDrm().setDirectSubmissionActive(true);
if (this->partitionedMode) {

View File

@ -45,6 +45,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
using BaseClass::getSizeSwitchRingBufferSection;
using BaseClass::hwInfo;
using BaseClass::osContext;
using BaseClass::partitionConfigSet;
using BaseClass::partitionedMode;
using BaseClass::performDiagnosticMode;
using BaseClass::ringBuffer;
@ -56,6 +57,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
using BaseClass::semaphorePtr;
using BaseClass::semaphores;
using BaseClass::setReturnAddress;
using BaseClass::startRingBuffer;
using BaseClass::stopRingBuffer;
using BaseClass::switchRingBuffersAllocations;
using BaseClass::workloadMode;

View File

@ -44,12 +44,15 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, DirectSubmissionDispatchBufferTest,
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice,
*osContext.get());
EXPECT_TRUE(directSubmission.partitionConfigSet);
directSubmission.partitionConfigSet = false;
directSubmission.disableMonitorFence = false;
directSubmission.partitionedMode = true;
directSubmission.workPartitionAllocation = ultCsr->getWorkPartitionAllocation();
bool ret = directSubmission.initialize(true);
EXPECT_TRUE(ret);
EXPECT_TRUE(directSubmission.partitionConfigSet);
EXPECT_NE(0x0u, directSubmission.ringCommandStream.getUsed());
GraphicsAllocation *oldRingAllocation = directSubmission.ringCommandStream.getGraphicsAllocation();
@ -119,12 +122,15 @@ HWTEST_F(DirectSubmissionDispatchBufferTest,
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice,
*osContext.get());
EXPECT_TRUE(directSubmission.partitionConfigSet);
directSubmission.activeTiles = 2;
directSubmission.partitionedMode = true;
directSubmission.partitionConfigSet = false;
directSubmission.workPartitionAllocation = ultCsr->getWorkPartitionAllocation();
bool ret = directSubmission.initialize(true);
EXPECT_TRUE(ret);
EXPECT_TRUE(directSubmission.partitionConfigSet);
EXPECT_NE(0x0u, directSubmission.ringCommandStream.getUsed());
size_t submitSize = RenderDispatcher<FamilyType>::getSizePreemption() +
@ -158,3 +164,60 @@ HWTEST_F(DirectSubmissionDispatchBufferTest,
uint64_t gpuAddress = ultCsr->getWorkPartitionAllocation()->getGpuAddress();
EXPECT_EQ(gpuAddress, loadRegisterMem->getMemoryAddress());
}
// Checks the deferred partition-configuration path: when direct submission is in
// partitioned mode but initialize() is called without submitting on init, nothing is
// written to the ring and partitionConfigSet stays false; a later explicit
// startRingBuffer() call must emit the partition register programming and set the flag.
HWTEST_F(DirectSubmissionDispatchBufferTest,
givenDirectSubmissionRingNotStartOnInitWhenMultiTileSupportedThenExpectMultiTileConfigSetDuringExplicitRingStart) {
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
// Make the device report two sub-devices and create the work partition allocation
// whose GPU address is compared against the MI_LOAD_REGISTER_MEM command below.
pDevice->rootCsrCreated = true;
pDevice->numSubDevices = 2;
auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(pDevice->getDefaultEngine().commandStreamReceiver);
ultCsr->staticWorkPartitioningEnabled = true;
ultCsr->createWorkPartitionAllocation(*pDevice);
// NOTE(review): flushStamp is not referenced anywhere else in this test body.
FlushStampTracker flushStamp(true);
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice,
*osContext.get());
// A freshly constructed mock is expected to report the partition config as already set.
EXPECT_TRUE(directSubmission.partitionConfigSet);
// Force two-tile partitioned mode with the partition configuration still pending.
directSubmission.activeTiles = 2;
directSubmission.partitionedMode = true;
directSubmission.partitionConfigSet = false;
directSubmission.workPartitionAllocation = ultCsr->getWorkPartitionAllocation();
// initialize(false): the ring must not be started, no commands may be written to the
// ring stream, and the pending partition configuration must remain pending.
bool ret = directSubmission.initialize(false);
EXPECT_TRUE(ret);
EXPECT_FALSE(directSubmission.partitionConfigSet);
EXPECT_FALSE(directSubmission.ringStart);
EXPECT_EQ(0x0u, directSubmission.ringCommandStream.getUsed());
// The explicit ring start is now responsible for programming the partition configuration.
ret = directSubmission.startRingBuffer();
EXPECT_TRUE(ret);
EXPECT_TRUE(directSubmission.partitionConfigSet);
EXPECT_TRUE(directSubmission.ringStart);
// Parse everything written to the ring stream from offset 0 and look for the
// register-programming commands.
HardwareParse hwParse;
hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, 0);
hwParse.findHardwareCommands<FamilyType>();
// At least one MI_LOAD_REGISTER_IMM must be present; ASSERT stops the test otherwise.
ASSERT_NE(hwParse.lriList.end(), hwParse.lriList.begin());
bool partitionRegisterFound = false;
for (auto &it : hwParse.lriList) {
auto loadRegisterImm = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(it);
// 0x23B4 is presumably PartitionRegisters<...>::addressOffsetCCSOffset and 8 the value of
// CommonConstants::partitionAddressOffset - TODO confirm against those definitions.
if (loadRegisterImm->getRegisterOffset() == 0x23B4u) {
EXPECT_EQ(8u, loadRegisterImm->getDataDword());
partitionRegisterFound = true;
}
}
EXPECT_TRUE(partitionRegisterFound);
// The first MI_LOAD_REGISTER_MEM found in the stream must load from the work partition allocation.
auto loadRegisterMemItor = find<MI_LOAD_REGISTER_MEM *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
ASSERT_NE(hwParse.cmdList.end(), loadRegisterMemItor);
auto loadRegisterMem = reinterpret_cast<MI_LOAD_REGISTER_MEM *>(*loadRegisterMemItor);
// 0x221C is presumably PartitionRegisters<...>::wparidCCSOffset - TODO confirm.
EXPECT_EQ(0x221Cu, loadRegisterMem->getRegisterAddress());
uint64_t gpuAddress = ultCsr->getWorkPartitionAllocation()->getGpuAddress();
EXPECT_EQ(gpuAddress, loadRegisterMem->getMemoryAddress());
}

View File

@ -67,6 +67,7 @@ struct MockDrmDirectSubmission : public DrmDirectSubmission<GfxFamily, Dispatche
using BaseClass::handleNewResourcesSubmission;
using BaseClass::handleResidency;
using BaseClass::isNewResourceHandleNeeded;
using BaseClass::partitionConfigSet;
using BaseClass::partitionedMode;
using BaseClass::ringStart;
using BaseClass::submit;
@ -314,7 +315,8 @@ HWTEST_F(DrmDirectSubmissionTest, givenMultipleActiveTilesWhenWaitingForTagUpdat
EXPECT_EQ(2u, CpuIntrinsicsTests::pauseCounter);
}
HWTEST_F(DrmDirectSubmissionTest, givenRenderDispatcherAndMultiTileDeviceWhenCreatingDirectSubmissionThenExpectActiveTilesMatchSubDeviceCount) {
HWTEST_F(DrmDirectSubmissionTest,
givenRenderDispatcherAndMultiTileDeviceWhenCreatingDirectSubmissionUsingMultiTileContextThenExpectActiveTilesMatchSubDeviceCount) {
using Dispatcher = RenderDispatcher<FamilyType>;
VariableBackup<bool> backup(&ImplicitScaling::apiSupport, true);
@ -322,6 +324,12 @@ HWTEST_F(DrmDirectSubmissionTest, givenRenderDispatcherAndMultiTileDeviceWhenCre
device->rootCsrCreated = true;
device->numSubDevices = 2;
osContext = std::make_unique<OsContextLinux>(*executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as<Drm>(), 0u,
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular},
PreemptionMode::ThreadGroup, device->getDeviceBitfield()));
osContext->ensureContextInitialized();
EXPECT_EQ(2u, osContext->getDeviceBitfield().count());
auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(device->getDefaultEngine().commandStreamReceiver);
ultCsr->staticWorkPartitioningEnabled = true;
ultCsr->createWorkPartitionAllocation(*device);
@ -331,6 +339,32 @@ HWTEST_F(DrmDirectSubmissionTest, givenRenderDispatcherAndMultiTileDeviceWhenCre
EXPECT_EQ(2u, directSubmission.activeTiles);
EXPECT_TRUE(directSubmission.partitionedMode);
EXPECT_FALSE(directSubmission.partitionConfigSet);
bool ret = directSubmission.allocateResources();
EXPECT_TRUE(ret);
}
HWTEST_F(DrmDirectSubmissionTest, givenRenderDispatcherAndMultiTileDeviceWhenCreatingDirectSubmissionSingleTileContextThenExpectActiveTilesEqualsSingleTile) {
using Dispatcher = RenderDispatcher<FamilyType>;
VariableBackup<bool> backup(&ImplicitScaling::apiSupport, true);
device->deviceBitfield.set(0b11);
device->rootCsrCreated = true;
device->numSubDevices = 2;
EXPECT_EQ(1u, osContext->getDeviceBitfield().count());
auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(device->getDefaultEngine().commandStreamReceiver);
ultCsr->staticWorkPartitioningEnabled = true;
ultCsr->createWorkPartitionAllocation(*device);
MockDrmDirectSubmission<FamilyType, Dispatcher> directSubmission(*device.get(),
*osContext.get());
EXPECT_EQ(1u, directSubmission.activeTiles);
EXPECT_FALSE(directSubmission.partitionedMode);
EXPECT_TRUE(directSubmission.partitionConfigSet);
bool ret = directSubmission.allocateResources();
EXPECT_TRUE(ret);
@ -342,11 +376,18 @@ HWTEST_F(DrmDirectSubmissionTest, givenBlitterDispatcherAndMultiTileDeviceWhenCr
VariableBackup<bool> backup(&ImplicitScaling::apiSupport, true);
device->deviceBitfield.set(0b11);
osContext = std::make_unique<OsContextLinux>(*executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as<Drm>(), 0u,
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular},
PreemptionMode::ThreadGroup, device->getDeviceBitfield()));
osContext->ensureContextInitialized();
EXPECT_EQ(2u, osContext->getDeviceBitfield().count());
MockDrmDirectSubmission<FamilyType, Dispatcher> directSubmission(*device.get(),
*osContext.get());
EXPECT_EQ(1u, directSubmission.activeTiles);
EXPECT_FALSE(directSubmission.partitionedMode);
EXPECT_TRUE(directSubmission.partitionConfigSet);
bool ret = directSubmission.allocateResources();
EXPECT_TRUE(ret);