diff --git a/level_zero/core/source/device/device_imp.cpp b/level_zero/core/source/device/device_imp.cpp index c3f0e395ae..8428e3da5e 100644 --- a/level_zero/core/source/device/device_imp.cpp +++ b/level_zero/core/source/device/device_imp.cpp @@ -1416,8 +1416,12 @@ ze_result_t DeviceImp::getCsrForOrdinalAndIndex(NEO::CommandStreamReceiver **csr auto &osContext = (*csr)->getOsContext(); - if (neoDevice->getNumberOfRegularContextsPerEngine() > 1 && !osContext.isRootDevice() && NEO::EngineHelpers::isCcs(osContext.getEngineType())) { - *csr = neoDevice->getNextEngineForMultiRegularContextMode().commandStreamReceiver; + if (neoDevice->getNumberOfRegularContextsPerEngine() > 1 && !osContext.isRootDevice()) { + if (NEO::EngineHelpers::isCcs(osContext.getEngineType())) { + *csr = neoDevice->getNextEngineForMultiRegularContextMode(aub_stream::EngineType::ENGINE_CCS).commandStreamReceiver; + } else if (osContext.getEngineType() == aub_stream::EngineType::ENGINE_BCS) { + *csr = neoDevice->getNextEngineForMultiRegularContextMode(aub_stream::EngineType::ENGINE_BCS).commandStreamReceiver; + } } } else { auto subDeviceOrdinal = ordinal - numEngineGroups; diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp index 1b944e35b6..ee3e5f2dfa 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp @@ -944,31 +944,52 @@ TEST_F(DeviceCreateCommandQueueTest, givenNormalPriorityDescWhenCreateCommandQue struct CommandQueueCreateWithMultipleRegularContextsTests : public DeviceCreateCommandQueueTest { void SetUp() override { DebugManager.flags.NumberOfRegularContextsPerEngine.set(numberOfRegularContextsPerEngine); + DebugManager.flags.EnableMultipleRegularContextForBcs.set(1); DebugManager.flags.NodeOrdinal.set(static_cast(aub_stream::EngineType::ENGINE_CCS)); backupHwInfo = std::make_unique>(defaultHwInfo.get()); + defaultHwInfo->capabilityTable.blitterOperationsSupported = true; defaultHwInfo->featureTable.flags.ftrCCSNode = true; DeviceCreateCommandQueueTest::SetUp(); - if (device->getHwInfo().gtSystemInfo.CCSInfo.NumberOfCCSEnabled == 0) { + uint32_t regularCcsCount = 0; + uint32_t regularBcsCount = 0; + + for (auto &engine : device->getNEODevice()->getAllEngines()) { + if (engine.getEngineUsage() == EngineUsage::Regular) { + if (engine.getEngineType() == aub_stream::EngineType::ENGINE_CCS) { + regularCcsCount++; + } else if (engine.getEngineType() == aub_stream::EngineType::ENGINE_BCS) { + regularBcsCount++; + } + } + } + + if (regularCcsCount < numberOfRegularContextsPerEngine - 1 || regularBcsCount < numberOfRegularContextsPerEngine - 1) { GTEST_SKIP(); } auto &engineGroups = device->getNEODevice()->getRegularEngineGroups(); for (uint32_t i = 0; i < engineGroups.size(); i++) { - if (engineGroups[i].engineGroupType == EngineGroupType::Compute) { + if (engineGroups[i].engineGroupType == EngineGroupType::Compute && !computeOrdinalSet) { computeOrdinal = i; - break; + computeOrdinalSet = true; + } else if (engineGroups[i].engineGroupType == EngineGroupType::Copy && !copyOrdinalSet) { + copyOrdinal = i; + copyOrdinalSet = true; } } } std::unique_ptr> backupHwInfo; + DebugManagerStateRestore restore; const uint32_t numberOfRegularContextsPerEngine = 5; uint32_t computeOrdinal = 0; - DebugManagerStateRestore restore; + uint32_t copyOrdinal = 0; + bool computeOrdinalSet = false; + bool copyOrdinalSet = false; }; HWTEST_F(CommandQueueCreateWithMultipleRegularContextsTests, givenSupportedRequestWhenCreatingCommandQueueThenAssignNextAvailableContext) { @@ -989,6 +1010,25 @@ HWTEST_F(CommandQueueCreateWithMultipleRegularContextsTests, givenSupportedReque } } +HWTEST_F(CommandQueueCreateWithMultipleRegularContextsTests, givenSupportedRequestWhenCreatingBcsCommandQueueThenAssignNextAvailableContext) { + auto defaultBcsIndex = static_cast(device->getNEODevice())->defaultBcsEngineIndex; + uint32_t expectedIndex = defaultBcsIndex; + constexpr uint32_t iterationCount = 3; + + for (uint32_t i = 0; i < (numberOfRegularContextsPerEngine * iterationCount); i++) { + NEO::CommandStreamReceiver *csr = nullptr; + device->getCsrForOrdinalAndIndex(&csr, copyOrdinal, 0u); + ASSERT_NE(nullptr, csr); + + EXPECT_EQ(csr, device->getNEODevice()->getAllEngines()[expectedIndex].commandStreamReceiver); + + expectedIndex++; + if ((expectedIndex - defaultBcsIndex) == (numberOfRegularContextsPerEngine - 1)) { + expectedIndex = defaultBcsIndex; + } + } +} + TEST_F(DeviceCreateCommandQueueTest, whenCallingGetCsrForOrdinalAndIndexWithInvalidOrdinalThenInvalidArgumentIsReturned) { ze_command_queue_desc_t desc{}; diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index d92fd799b8..f94a55679c 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -179,7 +179,7 @@ void CommandQueue::initializeGpgpu() const { engineRoundRobinAvailable; if (device->getDevice().getNumberOfRegularContextsPerEngine() > 1) { - this->gpgpuEngine = &device->getDevice().getNextEngineForMultiRegularContextMode(); + this->gpgpuEngine = &device->getDevice().getNextEngineForMultiRegularContextMode(aub_stream::EngineType::ENGINE_CCS); } else if (assignEngineRoundRobin) { this->gpgpuEngine = &device->getDevice().getNextEngineForCommandQueue(); } else { diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 3404972dab..e93e184c4b 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -225,6 +225,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, AdjustThreadGroupDispatchSize, -1, "-1: default, DECLARE_DEBUG_VARIABLE(int32_t, ForceNonblockingExecbufferCalls, -1, "-1: default, 0: make execbuffer call blocking, 1: make execbuffer call nonblocking. Supported only in prelim i915 kernels.") DECLARE_DEBUG_VARIABLE(int32_t, ForceComputeWalkerPostSyncFlush, -1, "-1: default, 0: disable 1: Enable all flushing bits in ComputeWalker->PostSync") DECLARE_DEBUG_VARIABLE(int32_t, NumberOfRegularContextsPerEngine, -1, "-1: default, >0: Create more than 1 Regular contexts for the same engine") +DECLARE_DEBUG_VARIABLE(int32_t, EnableMultipleRegularContextForBcs, -1, "-1: default, 0: disabled, 1: Use NumberOfRegularContextsPerEngine to create multiple Regular contexts on the same engine") DECLARE_DEBUG_VARIABLE(int32_t, AppendAubStreamContextFlags, -1, "-1: default, >0: Append flags passed during HardwareContext creation.") /*LOGGING FLAGS*/ diff --git a/shared/source/device/device.cpp b/shared/source/device/device.cpp index 3b6ff1c035..b7cab64358 100644 --- a/shared/source/device/device.cpp +++ b/shared/source/device/device.cpp @@ -396,6 +396,10 @@ bool Device::createEngine(uint32_t deviceCsrIndex, EngineTypeUsage engineTypeUsa } } + if (EngineHelpers::isBcs(engineType) && (defaultBcsEngineIndex == std::numeric_limits::max()) && (engineUsage == EngineUsage::Regular)) { + defaultBcsEngineIndex = deviceCsrIndex; + } + if (preemptionMode == PreemptionMode::MidThread && !commandStreamReceiver->createPreemptionAllocation()) { return false; } @@ -926,12 +930,23 @@ BuiltIns *Device::getBuiltIns() const { return executionEnvironment->rootDeviceEnvironments[getRootDeviceIndex()]->getBuiltIns(); } -EngineControl &Device::getNextEngineForMultiRegularContextMode() { +EngineControl &Device::getNextEngineForMultiRegularContextMode(aub_stream::EngineType engineType) { UNRECOVERABLE_IF(defaultEngineIndex != 0); + UNRECOVERABLE_IF((engineType != aub_stream::EngineType::ENGINE_BCS) && (engineType != aub_stream::EngineType::ENGINE_CCS)); - auto maxIndex = numberOfRegularContextsPerEngine - 1; // 1 for internal engine + const auto maxIndex = numberOfRegularContextsPerEngine - 1; // 1 for internal engine + uint32_t atomicOutValue = 0; + uint32_t indexOffset = 0; - auto indexToAssign = regularContextPerEngineAssignmentHelper.fetch_add(1) % maxIndex; + if (engineType == aub_stream::EngineType::ENGINE_CCS) { + atomicOutValue = regularContextPerCcsEngineAssignmentHelper.fetch_add(1); + indexOffset = defaultEngineIndex; + } else { + atomicOutValue = regularContextPerBcsEngineAssignmentHelper.fetch_add(1); + indexOffset = defaultBcsEngineIndex; + } + + auto indexToAssign = (atomicOutValue % maxIndex) + indexOffset; return allEngines[indexToAssign]; } diff --git a/shared/source/device/device.h b/shared/source/device/device.h index 1a6c2e35ad..bf3567e00d 100644 --- a/shared/source/device/device.h +++ b/shared/source/device/device.h @@ -81,7 +81,7 @@ class Device : public ReferenceTrackedObject { EngineControl &getEngine(uint32_t index); EngineControl &getDefaultEngine(); EngineControl &getNextEngineForCommandQueue(); - EngineControl &getNextEngineForMultiRegularContextMode(); + EngineControl &getNextEngineForMultiRegularContextMode(aub_stream::EngineType engineType); EngineControl &getInternalEngine(); EngineControl *getInternalCopyEngine(); SelectorCopyEngine &getSelectorCopyEngine(); @@ -203,9 +203,11 @@ class Device : public ReferenceTrackedObject { ExecutionEnvironment *executionEnvironment = nullptr; aub_stream::EngineType engineInstancedType = aub_stream::EngineType::NUM_ENGINES; uint32_t defaultEngineIndex = 0; + uint32_t defaultBcsEngineIndex = std::numeric_limits::max(); uint32_t numSubDevices = 0; std::atomic_uint32_t regularCommandQueuesCreatedWithinDeviceCount{0}; - std::atomic regularContextPerEngineAssignmentHelper = 0; + std::atomic regularContextPerCcsEngineAssignmentHelper = 0; + std::atomic regularContextPerBcsEngineAssignmentHelper = 0; std::bitset<8> availableEnginesForCommandQueueusRoundRobin = 0; uint32_t queuesPerEngineCount = 1; uint32_t numberOfRegularContextsPerEngine = 1; diff --git a/shared/test/common/mocks/mock_device.h b/shared/test/common/mocks/mock_device.h index e0a7228189..37dfc47248 100644 --- a/shared/test/common/mocks/mock_device.h +++ b/shared/test/common/mocks/mock_device.h @@ -52,6 +52,7 @@ class MockDevice : public RootDevice { using Device::createDeviceInternals; using Device::createEngine; using Device::createSubDevices; + using Device::defaultBcsEngineIndex; using Device::deviceBitfield; using Device::deviceInfo; using Device::engineInstanced; diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index defd5eca33..d8358b5808 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -495,4 +495,5 @@ OverrideUserFenceStartValue = -1 DirectSubmissionRelaxedOrderingQueueSizeLimit = -1 ExperimentalForceCopyThroughLock = -1 NumberOfRegularContextsPerEngine = -1 +EnableMultipleRegularContextForBcs = -1 AppendAubStreamContextFlags = -1 \ No newline at end of file diff --git a/shared/test/unit_test/device/neo_device_tests.cpp b/shared/test/unit_test/device/neo_device_tests.cpp index b8411df2c5..4caddd2583 100644 --- a/shared/test/unit_test/device/neo_device_tests.cpp +++ b/shared/test/unit_test/device/neo_device_tests.cpp @@ -715,3 +715,20 @@ TEST(FailDeviceTest, GivenMidThreadPreemptionAndFailedDeviceWhenCreatingDeviceTh EXPECT_EQ(nullptr, pDevice); } + +TEST_F(DeviceTests, whenInitializingDeviceThenSetCorrectDefaultBcsEngineIndex) { + if (!defaultHwInfo->capabilityTable.blitterOperationsSupported) { + GTEST_SKIP(); + } + + MockExecutionEnvironment executionEnvironment(defaultHwInfo.get()); + executionEnvironment.incRefInternal(); + + UltDeviceFactory deviceFactory{1, 0, executionEnvironment}; + + auto device = deviceFactory.rootDevices[0]; + auto &engine = device->allEngines[device->defaultBcsEngineIndex]; + + EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS, engine.getEngineType()); + EXPECT_EQ(EngineUsage::Regular, engine.getEngineUsage()); +}