From 5f5a1a6f810284fdbb594cd2893596c3f584b642 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Mon, 25 Apr 2022 11:06:41 +0000 Subject: [PATCH] Assign BCS at first blit enqueue Signed-off-by: Lukasz Jobczyk --- opencl/source/command_queue/command_queue.cpp | 42 +++++++++++----- opencl/source/command_queue/command_queue.h | 11 ++-- .../command_queue/command_queue_tests.cpp | 2 +- .../command_queue_tests_pvc_and_later.cpp | 50 +++++++++++++++++-- opencl/test/unit_test/event/event_tests.cpp | 1 + .../test/unit_test/mocks/mock_command_queue.h | 1 + .../test/unit_test/test_files/igdrcl.config | 1 + .../debug_settings/debug_variables_base.inl | 1 + 8 files changed, 87 insertions(+), 22 deletions(-) diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 841f76d384..650c8b8fd9 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -77,19 +77,22 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr UNRECOVERABLE_IF(gpgpuEngine->getEngineType() >= aub_stream::EngineType::NUM_ENGINES); - bool bcsAllowed = hwInfoConfig->isBlitterFullySupported(hwInfo) && - hwHelper.isSubDeviceEngineSupported(hwInfo, device->getDeviceBitfield(), aub_stream::EngineType::ENGINE_BCS); + bcsAllowed = hwInfoConfig->isBlitterFullySupported(hwInfo) && + hwHelper.isSubDeviceEngineSupported(hwInfo, device->getDeviceBitfield(), aub_stream::EngineType::ENGINE_BCS); if (bcsAllowed || gpgpuEngine->commandStreamReceiver->peekTimestampPacketWriteEnabled()) { timestampPacketContainer = std::make_unique(); deferredTimestampPackets = std::make_unique(); } - if (bcsAllowed) { - auto &neoDevice = device->getNearestGenericSubDevice(0)->getDevice(); - auto &selectorCopyEngine = neoDevice.getSelectorCopyEngine(); - auto bcsEngineType = EngineHelpers::getBcsEngineType(hwInfo, device->getDeviceBitfield(), selectorCopyEngine, internalUsage); - bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)] = neoDevice.tryGetEngine(bcsEngineType, EngineUsage::Regular); - bcsEngineTypes.push_back(bcsEngineType); + + auto deferCmdQBcsInitialization = true; + + if (DebugManager.flags.DeferCmdQBcsInitialization.get() != -1) { + deferCmdQBcsInitialization = DebugManager.flags.DeferCmdQBcsInitialization.get(); + } + + if (!deferCmdQBcsInitialization) { + this->initializeBcsEngine(internalUsage); } } @@ -134,7 +137,8 @@ CommandStreamReceiver &CommandQueue::getGpgpuCommandStreamReceiver() const { return *gpgpuEngine->commandStreamReceiver; } -CommandStreamReceiver *CommandQueue::getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const { +CommandStreamReceiver *CommandQueue::getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) { + initializeBcsEngine(isSpecial()); const EngineControl *engine = this->bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)]; if (engine == nullptr) { return nullptr; @@ -143,7 +147,8 @@ CommandStreamReceiver *CommandQueue::getBcsCommandStreamReceiver(aub_stream::Eng } } -CommandStreamReceiver *CommandQueue::getBcsForAuxTranslation() const { +CommandStreamReceiver *CommandQueue::getBcsForAuxTranslation() { + initializeBcsEngine(isSpecial()); for (const EngineControl *engine : this->bcsEngines) { if (engine != nullptr) { return engine->commandStreamReceiver; @@ -152,7 +157,8 @@ CommandStreamReceiver *CommandQueue::getBcsForAuxTranslation() const { return nullptr; } -CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelectionArgs &args) const { +CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelectionArgs &args) { + initializeBcsEngine(isSpecial()); if (isCopyOnly) { return *getBcsCommandStreamReceiver(bcsEngineTypes[0]); } @@ -220,6 +226,17 @@ CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelec return *selectedCsr; } +void CommandQueue::initializeBcsEngine(bool internalUsage) { + if (bcsAllowed && !bcsInitialized) { + auto &neoDevice = device->getNearestGenericSubDevice(0)->getDevice(); + auto &selectorCopyEngine = neoDevice.getSelectorCopyEngine(); + auto bcsEngineType = EngineHelpers::getBcsEngineType(device->getHardwareInfo(), device->getDeviceBitfield(), selectorCopyEngine, internalUsage); + bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)] = neoDevice.tryGetEngine(bcsEngineType, EngineUsage::Regular); + bcsEngineTypes.push_back(bcsEngineType); + bcsInitialized = true; + } +} + Device &CommandQueue::getDevice() const noexcept { return device->getDevice(); } @@ -233,7 +250,7 @@ volatile uint32_t *CommandQueue::getHwTagAddress() const { return getGpgpuCommandStreamReceiver().getTagAddress(); } -bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) const { +bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) { DEBUG_BREAK_IF(getHwTag() == CompletionStamp::notReady); if (getGpgpuCommandStreamReceiver().testTaskCountReady(getHwTagAddress(), gpgpuTaskCount)) { @@ -981,6 +998,7 @@ void CommandQueue::overrideEngine(aub_stream::EngineType engineType, EngineUsage timestampPacketContainer = std::make_unique(); deferredTimestampPackets = std::make_unique(); isCopyOnly = true; + bcsInitialized = true; } else { gpgpuEngine = &device->getEngine(engineType, engineUsage); } diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index d47cf3e255..5ae4bfc01d 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -202,7 +202,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { volatile uint32_t *getHwTagAddress() const; - bool isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) const; + bool isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState); bool isWaitForTimestampsEnabled() const; virtual bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount) = 0; @@ -223,9 +223,10 @@ class CommandQueue : public BaseObject<_cl_command_queue> { const cl_event *eventWaitList); MOCKABLE_VIRTUAL CommandStreamReceiver &getGpgpuCommandStreamReceiver() const; - MOCKABLE_VIRTUAL CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const; - CommandStreamReceiver *getBcsForAuxTranslation() const; - MOCKABLE_VIRTUAL CommandStreamReceiver &selectCsrForBuiltinOperation(const CsrSelectionArgs &args) const; + MOCKABLE_VIRTUAL CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType); + CommandStreamReceiver *getBcsForAuxTranslation(); + MOCKABLE_VIRTUAL CommandStreamReceiver &selectCsrForBuiltinOperation(const CsrSelectionArgs &args); + void initializeBcsEngine(bool internalUsage); Device &getDevice() const noexcept; ClDevice &getClDevice() const { return *device; } Context &getContext() const { return *context; } @@ -408,6 +409,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool perfCountersEnabled = false; bool isCopyOnly = false; + bool bcsAllowed = false; + bool bcsInitialized = false; LinearStream *commandStream = nullptr; diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index 3ce81eb004..1a4cc5c790 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -1067,7 +1067,7 @@ struct WaitUntilCompletionTests : public ::testing::Test { MyCmdQueue(Context *context, ClDevice *device) : CommandQueueHw(context, device, nullptr, false){}; - CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const override { + CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) override { return bcsCsrToReturn; } diff --git a/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp b/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp index 962ed67d3a..1d29e7f24f 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp @@ -84,6 +84,46 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenAdditionalBcsWhenCreatingCommandQue EXPECT_EQ(1u, queue->countBcsEngines()); } +HWTEST2_F(CommandQueuePvcAndLaterTests, givenDeferCmdQBcsInitializationEnabledWhenCreateCommandQueueThenBcsCountIsZero, IsAtLeastXeHpcCore) { + DebugManagerStateRestore restorer; + DebugManager.flags.DeferCmdQBcsInitialization.set(1u); + + HardwareInfo hwInfo = *defaultHwInfo; + hwInfo.featureTable.ftrBcsInfo = maxNBitValue(9); + hwInfo.capabilityTable.blitterOperationsSupported = true; + MockDevice *device = MockDevice::createWithNewExecutionEnvironment(&hwInfo, 0); + MockClDevice clDevice{device}; + cl_device_id clDeviceId = static_cast(&clDevice); + ClDeviceVector clDevices{&clDeviceId, 1u}; + cl_int retVal{}; + auto context = std::unique_ptr{Context::create(nullptr, clDevices, nullptr, nullptr, retVal)}; + EXPECT_EQ(CL_SUCCESS, retVal); + + auto queue = std::make_unique(*context); + + EXPECT_EQ(0u, queue->countBcsEngines()); +} + +HWTEST2_F(CommandQueuePvcAndLaterTests, givenDeferCmdQBcsInitializationDisabledWhenCreateCommandQueueThenBcsIsInitialized, IsAtLeastXeHpcCore) { + DebugManagerStateRestore restorer; + DebugManager.flags.DeferCmdQBcsInitialization.set(0u); + + HardwareInfo hwInfo = *defaultHwInfo; + hwInfo.featureTable.ftrBcsInfo = maxNBitValue(9); + hwInfo.capabilityTable.blitterOperationsSupported = true; + MockDevice *device = MockDevice::createWithNewExecutionEnvironment(&hwInfo, 0); + MockClDevice clDevice{device}; + cl_device_id clDeviceId = static_cast(&clDevice); + ClDeviceVector clDevices{&clDeviceId, 1u}; + cl_int retVal{}; + auto context = std::unique_ptr{Context::create(nullptr, clDevices, nullptr, nullptr, retVal)}; + EXPECT_EQ(CL_SUCCESS, retVal); + + auto queue = std::make_unique(*context); + + EXPECT_NE(0u, queue->countBcsEngines()); +} + HWTEST2_F(CommandQueuePvcAndLaterTests, givenQueueWithMainBcsIsReleasedWhenNewQueueIsCreatedThenMainBcsCanBeUsedAgain, IsAtLeastXeHpcCore) { HardwareInfo hwInfo = *defaultHwInfo; hwInfo.featureTable.ftrBcsInfo = maxNBitValue(9); @@ -102,9 +142,9 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenQueueWithMainBcsIsReleasedWhenNewQu auto queue4 = std::make_unique(*context); EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS, queue1->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS)->getOsContext().getEngineType()); - EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS1, queue2->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1)->getOsContext().getEngineType()); - EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS2, queue3->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2)->getOsContext().getEngineType()); - EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS1, queue4->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1)->getOsContext().getEngineType()); + EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS2, queue2->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2)->getOsContext().getEngineType()); + EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS1, queue3->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1)->getOsContext().getEngineType()); + EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS2, queue4->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2)->getOsContext().getEngineType()); // Releasing main BCS. Next creation should be able to grab it queue1.reset(); @@ -114,7 +154,7 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenQueueWithMainBcsIsReleasedWhenNewQu // Releasing link BCS. Shouldn't change anything queue2.reset(); queue2 = std::make_unique(*context); - EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS2, queue2->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2)->getOsContext().getEngineType()); + EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS1, queue2->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1)->getOsContext().getEngineType()); } HWTEST2_F(CommandQueuePvcAndLaterTests, givenCooperativeEngineUsageHintAndCcsWhenCreatingCommandQueueThenCreateQueueWithCooperativeEngine, IsAtLeastXeHpcCore) { @@ -455,10 +495,10 @@ HWTEST2_F(BcsCsrSelectionCommandQueueTests, givenMultipleEnginesInQueueWhenSelec aub_stream::ENGINE_BCS7, aub_stream::ENGINE_BCS8, }); - EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1), &queue->selectCsrForBuiltinOperation(args)); EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2), &queue->selectCsrForBuiltinOperation(args)); EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1), &queue->selectCsrForBuiltinOperation(args)); EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2), &queue->selectCsrForBuiltinOperation(args)); + EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS1), &queue->selectCsrForBuiltinOperation(args)); } } diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp index 5c8eef221f..cad94f4dc6 100644 --- a/opencl/test/unit_test/event/event_tests.cpp +++ b/opencl/test/unit_test/event/event_tests.cpp @@ -144,6 +144,7 @@ TEST(Event, givenBcsCsrSetInEventWhenPeekingBcsTaskCountThenReturnCorrectTaskCou new MockClDevice{MockDevice::createWithNewExecutionEnvironment(&hwInfo)}}; MockContext context{device.get()}; MockCommandQueue queue{context}; + queue.initializeBcsEngine(false); queue.updateBcsTaskCount(queue.bcsEngines[0]->getEngineType(), 19); Event event{&queue, CL_COMMAND_READ_BUFFER, 0, 0}; diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 40ef5362d2..910a47cef9 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -257,6 +257,7 @@ class MockCommandQueueHw : public CommandQueueHw { MockCommandQueueHw(Context *context, ClDevice *device, cl_queue_properties *properties) : BaseClass(context, device, properties, false) { + this->initializeBcsEngine(false); } void clearBcsEngines() { diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index ae81663e7e..05a46e8098 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -382,6 +382,7 @@ ForceExtendedKernelIsaSize = -1 MakeIndirectAllocationsResidentAsPack = -1 MakeEachAllocationResident = -1 AssignBCSAtEnqueue = -1 +DeferCmdQBcsInitialization = -1 ReuseKernelBinaries = -1 EnableChipsetUniqueUUID = -1 ForceSimdMessageSizeInWalker = -1 diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 80a5ff1204..3ba06e1581 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -267,6 +267,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ResolveDependenciesViaPipeControls, -1, "-1: def DECLARE_DEBUG_VARIABLE(int32_t, MakeIndirectAllocationsResidentAsPack, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver handles all indirect allocations as one pack instead of making them resident individually.") DECLARE_DEBUG_VARIABLE(int32_t, MakeEachAllocationResident, -1, "-1: default, 0: disabled, 1: bind every allocation at creation time, 2: bind all created allocations in flush") DECLARE_DEBUG_VARIABLE(int32_t, AssignBCSAtEnqueue, -1, "-1: default, 0:disabled, 1: enabled.") +DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQBcsInitialization, -1, "-1: default, 0:disabled, 1: enabled.") DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.") /*DIRECT SUBMISSION FLAGS*/