diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 9ce41b0dba..56672904da 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -73,19 +73,22 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily); - bool bcsAllowed = hwInfoConfig->isBlitterFullySupported(hwInfo) && - hwHelper.isSubDeviceEngineSupported(hwInfo, device->getDeviceBitfield(), aub_stream::EngineType::ENGINE_BCS); + bcsAllowed = hwInfoConfig->isBlitterFullySupported(hwInfo) && + hwHelper.isSubDeviceEngineSupported(hwInfo, device->getDeviceBitfield(), aub_stream::EngineType::ENGINE_BCS); if (bcsAllowed || device->getDefaultEngine().commandStreamReceiver->peekTimestampPacketWriteEnabled()) { timestampPacketContainer = std::make_unique(); deferredTimestampPackets = std::make_unique(); } - if (bcsAllowed) { - auto &neoDevice = device->getNearestGenericSubDevice(0)->getDevice(); - auto &selectorCopyEngine = neoDevice.getSelectorCopyEngine(); - auto bcsEngineType = EngineHelpers::getBcsEngineType(hwInfo, device->getDeviceBitfield(), selectorCopyEngine, internalUsage); - bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)] = neoDevice.tryGetEngine(bcsEngineType, EngineUsage::Regular); - bcsEngineTypes.push_back(bcsEngineType); + + auto deferCmdQBcsInitialization = hwInfo.featureTable.ftrBcsInfo.count() > 1u; + + if (DebugManager.flags.DeferCmdQBcsInitialization.get() != -1) { + deferCmdQBcsInitialization = DebugManager.flags.DeferCmdQBcsInitialization.get(); + } + + if (!deferCmdQBcsInitialization) { + this->initializeBcsEngine(internalUsage); } } @@ -181,7 +184,8 @@ CommandStreamReceiver &CommandQueue::getGpgpuCommandStreamReceiver() const { return *gpgpuEngine->commandStreamReceiver; } -CommandStreamReceiver *CommandQueue::getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const { +CommandStreamReceiver *CommandQueue::getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) { + initializeBcsEngine(isSpecial()); const EngineControl *engine = this->bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)]; if (engine == nullptr) { return nullptr; @@ -190,7 +194,8 @@ CommandStreamReceiver *CommandQueue::getBcsCommandStreamReceiver(aub_stream::Eng } } -CommandStreamReceiver *CommandQueue::getBcsForAuxTranslation() const { +CommandStreamReceiver *CommandQueue::getBcsForAuxTranslation() { + initializeBcsEngine(isSpecial()); for (const EngineControl *engine : this->bcsEngines) { if (engine != nullptr) { return engine->commandStreamReceiver; @@ -199,7 +204,8 @@ CommandStreamReceiver *CommandQueue::getBcsForAuxTranslation() const { return nullptr; } -CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelectionArgs &args) const { +CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelectionArgs &args) { + initializeBcsEngine(isSpecial()); if (isCopyOnly) { return *getBcsCommandStreamReceiver(bcsEngineTypes[0]); } @@ -267,6 +273,21 @@ CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelec return *selectedCsr; } +void CommandQueue::initializeBcsEngine(bool internalUsage) { + if (bcsAllowed && !bcsInitialized) { + auto &neoDevice = device->getNearestGenericSubDevice(0)->getDevice(); + auto &selectorCopyEngine = neoDevice.getSelectorCopyEngine(); + auto bcsEngineType = EngineHelpers::getBcsEngineType(device->getHardwareInfo(), device->getDeviceBitfield(), selectorCopyEngine, internalUsage); + bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)] = neoDevice.tryGetEngine(bcsEngineType, EngineUsage::Regular); + bcsEngineTypes.push_back(bcsEngineType); + bcsInitialized = true; + if (bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)]) { + bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)]->osContext->ensureContextInitialized(); + bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)]->commandStreamReceiver->initDirectSubmission(); + } + } +} + Device &CommandQueue::getDevice() const noexcept { return device->getDevice(); } @@ -280,7 +301,7 @@ volatile uint32_t *CommandQueue::getHwTagAddress() const { return getGpgpuCommandStreamReceiver().getTagAddress(); } -bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) const { +bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) { DEBUG_BREAK_IF(getHwTag() == CompletionStamp::notReady); if (getGpgpuCommandStreamReceiver().testTaskCountReady(getHwTagAddress(), gpgpuTaskCount)) { @@ -1028,6 +1049,7 @@ void CommandQueue::overrideEngine(aub_stream::EngineType engineType, EngineUsage timestampPacketContainer = std::make_unique(); deferredTimestampPackets = std::make_unique(); isCopyOnly = true; + bcsInitialized = true; } else { gpgpuEngine = &device->getEngine(engineType, engineUsage); } diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 199ee50d53..26409b27a8 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -202,7 +202,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { volatile uint32_t *getHwTagAddress() const; - bool isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) const; + bool isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState); bool isWaitForTimestampsEnabled() const; virtual bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount) = 0; @@ -225,9 +225,10 @@ class CommandQueue : public BaseObject<_cl_command_queue> { void initializeGpgpu() const; void initializeGpgpuInternals() const; MOCKABLE_VIRTUAL CommandStreamReceiver &getGpgpuCommandStreamReceiver() const; - MOCKABLE_VIRTUAL CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const; - CommandStreamReceiver *getBcsForAuxTranslation() const; - MOCKABLE_VIRTUAL CommandStreamReceiver &selectCsrForBuiltinOperation(const CsrSelectionArgs &args) const; + MOCKABLE_VIRTUAL CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType); + CommandStreamReceiver *getBcsForAuxTranslation(); + MOCKABLE_VIRTUAL CommandStreamReceiver &selectCsrForBuiltinOperation(const CsrSelectionArgs &args); + void initializeBcsEngine(bool internalUsage); Device &getDevice() const noexcept; ClDevice &getClDevice() const { return *device; } Context &getContext() const { return *context; } @@ -413,6 +414,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool perfCountersEnabled = false; bool isCopyOnly = false; + bool bcsAllowed = false; + bool bcsInitialized = false; LinearStream *commandStream = nullptr; diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index 3c5c22b8fa..78d4478bbd 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -1067,7 +1067,7 @@ struct WaitUntilCompletionTests : public ::testing::Test { MyCmdQueue(Context *context, ClDevice *device) : CommandQueueHw(context, device, nullptr, false){}; - CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const override { + CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) override { return bcsCsrToReturn; } diff --git a/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp b/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp index 9f04352919..b93aae7397 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp @@ -84,6 +84,46 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenAdditionalBcsWhenCreatingCommandQue EXPECT_EQ(1u, queue->countBcsEngines()); } +HWTEST2_F(CommandQueuePvcAndLaterTests, givenDeferCmdQBcsInitializationEnabledWhenCreateCommandQueueThenBcsCountIsZero, IsAtLeastXeHpcCore) { + DebugManagerStateRestore restorer; + DebugManager.flags.DeferCmdQBcsInitialization.set(1u); + + HardwareInfo hwInfo = *defaultHwInfo; + hwInfo.featureTable.ftrBcsInfo = maxNBitValue(9); + hwInfo.capabilityTable.blitterOperationsSupported = true; + MockDevice *device = MockDevice::createWithNewExecutionEnvironment(&hwInfo, 0); + MockClDevice clDevice{device}; + cl_device_id clDeviceId = static_cast(&clDevice); + ClDeviceVector clDevices{&clDeviceId, 1u}; + cl_int retVal{}; + auto context = std::unique_ptr{Context::create(nullptr, clDevices, nullptr, nullptr, retVal)}; + EXPECT_EQ(CL_SUCCESS, retVal); + + auto queue = std::make_unique(*context); + + EXPECT_EQ(0u, queue->countBcsEngines()); +} + +HWTEST2_F(CommandQueuePvcAndLaterTests, givenDeferCmdQBcsInitializationDisabledWhenCreateCommandQueueThenBcsIsInitialized, IsAtLeastXeHpcCore) { + DebugManagerStateRestore restorer; + DebugManager.flags.DeferCmdQBcsInitialization.set(0u); + + HardwareInfo hwInfo = *defaultHwInfo; + hwInfo.featureTable.ftrBcsInfo = maxNBitValue(9); + hwInfo.capabilityTable.blitterOperationsSupported = true; + MockDevice *device = MockDevice::createWithNewExecutionEnvironment(&hwInfo, 0); + MockClDevice clDevice{device}; + cl_device_id clDeviceId = static_cast(&clDevice); + ClDeviceVector clDevices{&clDeviceId, 1u}; + cl_int retVal{}; + auto context = std::unique_ptr{Context::create(nullptr, clDevices, nullptr, nullptr, retVal)}; + EXPECT_EQ(CL_SUCCESS, retVal); + + auto queue = std::make_unique(*context); + + EXPECT_NE(0u, queue->countBcsEngines()); +} + HWTEST2_F(CommandQueuePvcAndLaterTests, givenQueueWithMainBcsIsReleasedWhenNewQueueIsCreatedThenMainBcsCanBeUsedAgain, IsAtLeastXeHpcCore) { HardwareInfo hwInfo = *defaultHwInfo; hwInfo.featureTable.ftrBcsInfo = maxNBitValue(9); @@ -102,9 +142,9 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenQueueWithMainBcsIsReleasedWhenNewQu auto queue4 = std::make_unique(*context); EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS, queue1->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS)->getOsContext().getEngineType()); - EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS1, queue2->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1)->getOsContext().getEngineType()); - EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS2, queue3->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2)->getOsContext().getEngineType()); - EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS1, queue4->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1)->getOsContext().getEngineType()); + EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS2, queue2->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2)->getOsContext().getEngineType()); + EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS1, queue3->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1)->getOsContext().getEngineType()); + EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS2, queue4->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2)->getOsContext().getEngineType()); // Releasing main BCS. Next creation should be able to grab it queue1.reset(); @@ -114,7 +154,7 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenQueueWithMainBcsIsReleasedWhenNewQu // Releasing link BCS. Shouldn't change anything queue2.reset(); queue2 = std::make_unique(*context); - EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS2, queue2->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2)->getOsContext().getEngineType()); + EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS1, queue2->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1)->getOsContext().getEngineType()); } HWTEST2_F(CommandQueuePvcAndLaterTests, givenCooperativeEngineUsageHintAndCcsWhenCreatingCommandQueueThenCreateQueueWithCooperativeEngine, IsAtLeastXeHpcCore) { @@ -491,10 +531,10 @@ HWTEST2_F(BcsCsrSelectionCommandQueueTests, givenMultipleEnginesInQueueWhenSelec aub_stream::ENGINE_BCS7, aub_stream::ENGINE_BCS8, }); - EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1), &queue->selectCsrForBuiltinOperation(args)); EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2), &queue->selectCsrForBuiltinOperation(args)); EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1), &queue->selectCsrForBuiltinOperation(args)); EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2), &queue->selectCsrForBuiltinOperation(args)); + EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1), &queue->selectCsrForBuiltinOperation(args)); } } diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp index 5c8eef221f..cad94f4dc6 100644 --- a/opencl/test/unit_test/event/event_tests.cpp +++ b/opencl/test/unit_test/event/event_tests.cpp @@ -144,6 +144,7 @@ TEST(Event, givenBcsCsrSetInEventWhenPeekingBcsTaskCountThenReturnCorrectTaskCou new MockClDevice{MockDevice::createWithNewExecutionEnvironment(&hwInfo)}}; MockContext context{device.get()}; MockCommandQueue queue{context}; + queue.initializeBcsEngine(false); queue.updateBcsTaskCount(queue.bcsEngines[0]->getEngineType(), 19); Event event{&queue, CL_COMMAND_READ_BUFFER, 0, 0}; diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index da2b628973..ac2ad190d2 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -257,6 +257,7 @@ class MockCommandQueueHw : public CommandQueueHw { MockCommandQueueHw(Context *context, ClDevice *device, cl_queue_properties *properties) : BaseClass(context, device, properties, false) { + this->initializeBcsEngine(false); } void clearBcsEngines() { diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index 129c5a0488..1f25ba0261 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -383,6 +383,7 @@ MakeIndirectAllocationsResidentAsPack = -1 MakeEachAllocationResident = -1 AssignBCSAtEnqueue = -1 DeferCmdQGpgpuInitialization = -1 +DeferCmdQBcsInitialization = -1 ReuseKernelBinaries = -1 EnableChipsetUniqueUUID = -1 ForceSimdMessageSizeInWalker = -1 diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index aafb01d705..5f2dda82c0 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -268,6 +268,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, MakeIndirectAllocationsResidentAsPack, -1, "-1: DECLARE_DEBUG_VARIABLE(int32_t, MakeEachAllocationResident, -1, "-1: default, 0: disabled, 1: bind every allocation at creation time, 2: bind all created allocations in flush") DECLARE_DEBUG_VARIABLE(int32_t, AssignBCSAtEnqueue, -1, "-1: default, 0:disabled, 1: enabled.") DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQGpgpuInitialization, -1, "-1: default, 0:disabled, 1: enabled.") +DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQBcsInitialization, -1, "-1: default, 0:disabled, 1: enabled.") DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.") /*DIRECT SUBMISSION FLAGS*/