Capability to create multiple Regular BCS contexts per engine.

Related-To: NEO-7618

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-01-19 08:57:27 +00:00
committed by Compute-Runtime-Automation
parent 286c672ef4
commit 5a5596957a
9 changed files with 93 additions and 12 deletions

View File

@@ -1416,8 +1416,12 @@ ze_result_t DeviceImp::getCsrForOrdinalAndIndex(NEO::CommandStreamReceiver **csr
auto &osContext = (*csr)->getOsContext(); auto &osContext = (*csr)->getOsContext();
if (neoDevice->getNumberOfRegularContextsPerEngine() > 1 && !osContext.isRootDevice() && NEO::EngineHelpers::isCcs(osContext.getEngineType())) { if (neoDevice->getNumberOfRegularContextsPerEngine() > 1 && !osContext.isRootDevice()) {
*csr = neoDevice->getNextEngineForMultiRegularContextMode().commandStreamReceiver; if (NEO::EngineHelpers::isCcs(osContext.getEngineType())) {
*csr = neoDevice->getNextEngineForMultiRegularContextMode(aub_stream::EngineType::ENGINE_CCS).commandStreamReceiver;
} else if (osContext.getEngineType() == aub_stream::EngineType::ENGINE_BCS) {
*csr = neoDevice->getNextEngineForMultiRegularContextMode(aub_stream::EngineType::ENGINE_BCS).commandStreamReceiver;
}
} }
} else { } else {
auto subDeviceOrdinal = ordinal - numEngineGroups; auto subDeviceOrdinal = ordinal - numEngineGroups;

View File

@@ -944,31 +944,52 @@ TEST_F(DeviceCreateCommandQueueTest, givenNormalPriorityDescWhenCreateCommandQue
// Fixture for command-queue tests that run with several Regular contexts per engine.
// NOTE: this view shows the pre-change and post-change text of each line side by side;
// lines that appear only once were added by the change.
struct CommandQueueCreateWithMultipleRegularContextsTests : public DeviceCreateCommandQueueTest { struct CommandQueueCreateWithMultipleRegularContextsTests : public DeviceCreateCommandQueueTest {
// Sets the debug flags and hardware-info overrides before the base fixture's SetUp runs,
// then skips the test when too few regular engines exist and records the group ordinals.
void SetUp() override { void SetUp() override {
DebugManager.flags.NumberOfRegularContextsPerEngine.set(numberOfRegularContextsPerEngine); DebugManager.flags.NumberOfRegularContextsPerEngine.set(numberOfRegularContextsPerEngine);
DebugManager.flags.EnableMultipleRegularContextForBcs.set(1);
DebugManager.flags.NodeOrdinal.set(static_cast<int32_t>(aub_stream::EngineType::ENGINE_CCS)); DebugManager.flags.NodeOrdinal.set(static_cast<int32_t>(aub_stream::EngineType::ENGINE_CCS));
// defaultHwInfo is modified below; backupHwInfo is created first from the same pointer
// (presumably to restore it when the fixture is destroyed — confirm against VariableBackup).
backupHwInfo = std::make_unique<VariableBackup<HardwareInfo>>(defaultHwInfo.get()); backupHwInfo = std::make_unique<VariableBackup<HardwareInfo>>(defaultHwInfo.get());
defaultHwInfo->capabilityTable.blitterOperationsSupported = true;
defaultHwInfo->featureTable.flags.ftrCCSNode = true; defaultHwInfo->featureTable.flags.ftrCCSNode = true;
DeviceCreateCommandQueueTest::SetUp(); DeviceCreateCommandQueueTest::SetUp();
// Post-change: count Regular-usage engines of type ENGINE_CCS and ENGINE_BCS among all engines.
if (device->getHwInfo().gtSystemInfo.CCSInfo.NumberOfCCSEnabled == 0) { uint32_t regularCcsCount = 0;
uint32_t regularBcsCount = 0;
for (auto &engine : device->getNEODevice()->getAllEngines()) {
if (engine.getEngineUsage() == EngineUsage::Regular) {
if (engine.getEngineType() == aub_stream::EngineType::ENGINE_CCS) {
regularCcsCount++;
} else if (engine.getEngineType() == aub_stream::EngineType::ENGINE_BCS) {
regularBcsCount++;
}
}
}
// Skip unless both engine types have at least (numberOfRegularContextsPerEngine - 1) regular engines.
if (regularCcsCount < numberOfRegularContextsPerEngine - 1 || regularBcsCount < numberOfRegularContextsPerEngine - 1) {
GTEST_SKIP(); GTEST_SKIP();
} }
// Record the index of the first Compute group and (post-change) the first Copy group.
auto &engineGroups = device->getNEODevice()->getRegularEngineGroups(); auto &engineGroups = device->getNEODevice()->getRegularEngineGroups();
for (uint32_t i = 0; i < engineGroups.size(); i++) { for (uint32_t i = 0; i < engineGroups.size(); i++) {
if (engineGroups[i].engineGroupType == EngineGroupType::Compute) { if (engineGroups[i].engineGroupType == EngineGroupType::Compute && !computeOrdinalSet) {
computeOrdinal = i; computeOrdinal = i;
break; computeOrdinalSet = true;
} else if (engineGroups[i].engineGroupType == EngineGroupType::Copy && !copyOrdinalSet) {
copyOrdinal = i;
copyOrdinalSet = true;
} }
} }
} }
std::unique_ptr<VariableBackup<HardwareInfo>> backupHwInfo; std::unique_ptr<VariableBackup<HardwareInfo>> backupHwInfo;
DebugManagerStateRestore restore;
// Value written to the NumberOfRegularContextsPerEngine debug flag in SetUp.
const uint32_t numberOfRegularContextsPerEngine = 5; const uint32_t numberOfRegularContextsPerEngine = 5;
// Engine-group indices found in SetUp; the *Set flags make the loop keep only the first match.
uint32_t computeOrdinal = 0; uint32_t computeOrdinal = 0;
DebugManagerStateRestore restore; uint32_t copyOrdinal = 0;
bool computeOrdinalSet = false;
bool copyOrdinalSet = false;
}; };
HWTEST_F(CommandQueueCreateWithMultipleRegularContextsTests, givenSupportedRequestWhenCreatingCommandQueueThenAssignNextAvailableContext) { HWTEST_F(CommandQueueCreateWithMultipleRegularContextsTests, givenSupportedRequestWhenCreatingCommandQueueThenAssignNextAvailableContext) {
@@ -989,6 +1010,25 @@ HWTEST_F(CommandQueueCreateWithMultipleRegularContextsTests, givenSupportedReque
} }
} }
// Requests a CSR for the copy ordinal repeatedly and checks that the returned
// receivers walk through allEngines starting at the default BCS engine index,
// wrapping after (numberOfRegularContextsPerEngine - 1) entries.
HWTEST_F(CommandQueueCreateWithMultipleRegularContextsTests, givenSupportedRequestWhenCreatingBcsCommandQueueThenAssignNextAvailableContext) {
    constexpr uint32_t iterationCount = 3;
    const uint32_t firstBcsIndex = static_cast<MockDevice *>(device->getNEODevice())->defaultBcsEngineIndex;
    const uint32_t contextsInRotation = numberOfRegularContextsPerEngine - 1;
    const uint32_t requestCount = numberOfRegularContextsPerEngine * iterationCount;

    // Offset of the expected engine relative to the first regular BCS engine.
    uint32_t rotationOffset = 0;
    for (uint32_t request = 0; request < requestCount; ++request) {
        NEO::CommandStreamReceiver *selectedCsr = nullptr;
        device->getCsrForOrdinalAndIndex(&selectedCsr, copyOrdinal, 0u);
        ASSERT_NE(nullptr, selectedCsr);

        auto &expectedEngine = device->getNEODevice()->getAllEngines()[firstBcsIndex + rotationOffset];
        EXPECT_EQ(selectedCsr, expectedEngine.commandStreamReceiver);

        // Advance and wrap back to the first entry once the rotation is exhausted.
        rotationOffset = (rotationOffset + 1) % contextsInRotation;
    }
}
TEST_F(DeviceCreateCommandQueueTest, TEST_F(DeviceCreateCommandQueueTest,
whenCallingGetCsrForOrdinalAndIndexWithInvalidOrdinalThenInvalidArgumentIsReturned) { whenCallingGetCsrForOrdinalAndIndexWithInvalidOrdinalThenInvalidArgumentIsReturned) {
ze_command_queue_desc_t desc{}; ze_command_queue_desc_t desc{};

View File

@@ -179,7 +179,7 @@ void CommandQueue::initializeGpgpu() const {
engineRoundRobinAvailable; engineRoundRobinAvailable;
if (device->getDevice().getNumberOfRegularContextsPerEngine() > 1) { if (device->getDevice().getNumberOfRegularContextsPerEngine() > 1) {
this->gpgpuEngine = &device->getDevice().getNextEngineForMultiRegularContextMode(); this->gpgpuEngine = &device->getDevice().getNextEngineForMultiRegularContextMode(aub_stream::EngineType::ENGINE_CCS);
} else if (assignEngineRoundRobin) { } else if (assignEngineRoundRobin) {
this->gpgpuEngine = &device->getDevice().getNextEngineForCommandQueue(); this->gpgpuEngine = &device->getDevice().getNextEngineForCommandQueue();
} else { } else {

View File

@@ -225,6 +225,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, AdjustThreadGroupDispatchSize, -1, "-1: default,
DECLARE_DEBUG_VARIABLE(int32_t, ForceNonblockingExecbufferCalls, -1, "-1: default, 0: make execbuffer call blocking, 1: make execbuffer call nonblocking. Supported only in prelim i915 kernels.") DECLARE_DEBUG_VARIABLE(int32_t, ForceNonblockingExecbufferCalls, -1, "-1: default, 0: make execbuffer call blocking, 1: make execbuffer call nonblocking. Supported only in prelim i915 kernels.")
DECLARE_DEBUG_VARIABLE(int32_t, ForceComputeWalkerPostSyncFlush, -1, "-1: default, 0: disable 1: Enable all flushing bits in ComputeWalker->PostSync") DECLARE_DEBUG_VARIABLE(int32_t, ForceComputeWalkerPostSyncFlush, -1, "-1: default, 0: disable 1: Enable all flushing bits in ComputeWalker->PostSync")
DECLARE_DEBUG_VARIABLE(int32_t, NumberOfRegularContextsPerEngine, -1, "-1: default, >0: Create more than 1 Regular contexts for the same engine") DECLARE_DEBUG_VARIABLE(int32_t, NumberOfRegularContextsPerEngine, -1, "-1: default, >0: Create more than 1 Regular contexts for the same engine")
DECLARE_DEBUG_VARIABLE(int32_t, EnableMultipleRegularContextForBcs, -1, "-1: default, 0: disabled, 1: Use NumberOfRegularContextsPerEngine to create multiple Regular contexts on the same engine")
DECLARE_DEBUG_VARIABLE(int32_t, AppendAubStreamContextFlags, -1, "-1: default, >0: Append flags passed during HardwareContext creation.") DECLARE_DEBUG_VARIABLE(int32_t, AppendAubStreamContextFlags, -1, "-1: default, >0: Append flags passed during HardwareContext creation.")
/*LOGGING FLAGS*/ /*LOGGING FLAGS*/

View File

@@ -396,6 +396,10 @@ bool Device::createEngine(uint32_t deviceCsrIndex, EngineTypeUsage engineTypeUsa
} }
} }
if (EngineHelpers::isBcs(engineType) && (defaultBcsEngineIndex == std::numeric_limits<uint32_t>::max()) && (engineUsage == EngineUsage::Regular)) {
defaultBcsEngineIndex = deviceCsrIndex;
}
if (preemptionMode == PreemptionMode::MidThread && !commandStreamReceiver->createPreemptionAllocation()) { if (preemptionMode == PreemptionMode::MidThread && !commandStreamReceiver->createPreemptionAllocation()) {
return false; return false;
} }
@@ -926,12 +930,23 @@ BuiltIns *Device::getBuiltIns() const {
return executionEnvironment->rootDeviceEnvironments[getRootDeviceIndex()]->getBuiltIns(); return executionEnvironment->rootDeviceEnvironments[getRootDeviceIndex()]->getBuiltIns();
} }
// Picks the next entry of allEngines to hand out when several Regular contexts exist per engine.
// NOTE: this view shows the pre-change and post-change text of each line side by side;
// lines that appear only once were added by the change.
// Post-change behaviour: the caller passes the engine type, a per-type atomic counter is
// advanced, and (counter % maxIndex) is added to defaultEngineIndex for ENGINE_CCS or to
// defaultBcsEngineIndex for ENGINE_BCS to form the index into allEngines.
// NOTE(review): maxIndex is a modulo divisor; the visible callers check
// getNumberOfRegularContextsPerEngine() > 1 before calling — confirm no other caller reaches
// this with a value of 1.
EngineControl &Device::getNextEngineForMultiRegularContextMode() { EngineControl &Device::getNextEngineForMultiRegularContextMode(aub_stream::EngineType engineType) {
// Aborts unless the default engine index is 0 and the requested type is ENGINE_BCS or ENGINE_CCS.
UNRECOVERABLE_IF(defaultEngineIndex != 0);
UNRECOVERABLE_IF((engineType != aub_stream::EngineType::ENGINE_BCS) && (engineType != aub_stream::EngineType::ENGINE_CCS));
auto maxIndex = numberOfRegularContextsPerEngine - 1; // 1 for internal engine const auto maxIndex = numberOfRegularContextsPerEngine - 1; // 1 for internal engine
uint32_t atomicOutValue = 0;
uint32_t indexOffset = 0;
// fetch_add returns the counter value before the increment; CCS and BCS use separate counters.
auto indexToAssign = regularContextPerEngineAssignmentHelper.fetch_add(1) % maxIndex; if (engineType == aub_stream::EngineType::ENGINE_CCS) {
atomicOutValue = regularContextPerCcsEngineAssignmentHelper.fetch_add(1);
indexOffset = defaultEngineIndex;
} else {
atomicOutValue = regularContextPerBcsEngineAssignmentHelper.fetch_add(1);
// NOTE(review): defaultBcsEngineIndex starts as the uint32_t maximum; presumably a regular
// BCS engine has always been created before ENGINE_BCS is requested here — confirm.
indexOffset = defaultBcsEngineIndex;
}
auto indexToAssign = (atomicOutValue % maxIndex) + indexOffset;
return allEngines[indexToAssign]; return allEngines[indexToAssign];
} }

View File

@@ -81,7 +81,7 @@ class Device : public ReferenceTrackedObject<Device> {
EngineControl &getEngine(uint32_t index); EngineControl &getEngine(uint32_t index);
EngineControl &getDefaultEngine(); EngineControl &getDefaultEngine();
EngineControl &getNextEngineForCommandQueue(); EngineControl &getNextEngineForCommandQueue();
EngineControl &getNextEngineForMultiRegularContextMode(); EngineControl &getNextEngineForMultiRegularContextMode(aub_stream::EngineType engineType);
EngineControl &getInternalEngine(); EngineControl &getInternalEngine();
EngineControl *getInternalCopyEngine(); EngineControl *getInternalCopyEngine();
SelectorCopyEngine &getSelectorCopyEngine(); SelectorCopyEngine &getSelectorCopyEngine();
@@ -203,9 +203,11 @@ class Device : public ReferenceTrackedObject<Device> {
ExecutionEnvironment *executionEnvironment = nullptr; ExecutionEnvironment *executionEnvironment = nullptr;
aub_stream::EngineType engineInstancedType = aub_stream::EngineType::NUM_ENGINES; aub_stream::EngineType engineInstancedType = aub_stream::EngineType::NUM_ENGINES;
uint32_t defaultEngineIndex = 0; uint32_t defaultEngineIndex = 0;
uint32_t defaultBcsEngineIndex = std::numeric_limits<uint32_t>::max();
uint32_t numSubDevices = 0; uint32_t numSubDevices = 0;
std::atomic_uint32_t regularCommandQueuesCreatedWithinDeviceCount{0}; std::atomic_uint32_t regularCommandQueuesCreatedWithinDeviceCount{0};
std::atomic<uint8_t> regularContextPerEngineAssignmentHelper = 0; std::atomic<uint8_t> regularContextPerCcsEngineAssignmentHelper = 0;
std::atomic<uint8_t> regularContextPerBcsEngineAssignmentHelper = 0;
std::bitset<8> availableEnginesForCommandQueueusRoundRobin = 0; std::bitset<8> availableEnginesForCommandQueueusRoundRobin = 0;
uint32_t queuesPerEngineCount = 1; uint32_t queuesPerEngineCount = 1;
uint32_t numberOfRegularContextsPerEngine = 1; uint32_t numberOfRegularContextsPerEngine = 1;

View File

@@ -52,6 +52,7 @@ class MockDevice : public RootDevice {
using Device::createDeviceInternals; using Device::createDeviceInternals;
using Device::createEngine; using Device::createEngine;
using Device::createSubDevices; using Device::createSubDevices;
using Device::defaultBcsEngineIndex;
using Device::deviceBitfield; using Device::deviceBitfield;
using Device::deviceInfo; using Device::deviceInfo;
using Device::engineInstanced; using Device::engineInstanced;

View File

@@ -495,4 +495,5 @@ OverrideUserFenceStartValue = -1
DirectSubmissionRelaxedOrderingQueueSizeLimit = -1 DirectSubmissionRelaxedOrderingQueueSizeLimit = -1
ExperimentalForceCopyThroughLock = -1 ExperimentalForceCopyThroughLock = -1
NumberOfRegularContextsPerEngine = -1 NumberOfRegularContextsPerEngine = -1
EnableMultipleRegularContextForBcs = -1
AppendAubStreamContextFlags = -1 AppendAubStreamContextFlags = -1

View File

@@ -715,3 +715,20 @@ TEST(FailDeviceTest, GivenMidThreadPreemptionAndFailedDeviceWhenCreatingDeviceTh
EXPECT_EQ(nullptr, pDevice); EXPECT_EQ(nullptr, pDevice);
} }
// Creates a root device through UltDeviceFactory and checks that the engine stored at
// defaultBcsEngineIndex is an ENGINE_BCS engine with Regular usage. Skipped when the
// default hardware info reports no blitter support.
TEST_F(DeviceTests, whenInitializingDeviceThenSetCorrectDefaultBcsEngineIndex) {
    const bool blitterSupported = defaultHwInfo->capabilityTable.blitterOperationsSupported;
    if (!blitterSupported) {
        GTEST_SKIP();
    }

    MockExecutionEnvironment executionEnvironment(defaultHwInfo.get());
    executionEnvironment.incRefInternal();
    UltDeviceFactory deviceFactory{1, 0, executionEnvironment};

    auto rootDevice = deviceFactory.rootDevices[0];
    const auto bcsIndex = rootDevice->defaultBcsEngineIndex;
    auto &defaultBcsEngine = rootDevice->allEngines[bcsIndex];

    EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS, defaultBcsEngine.getEngineType());
    EXPECT_EQ(EngineUsage::Regular, defaultBcsEngine.getEngineUsage());
}