diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index df6e2a3635..08da25aac0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -353,6 +353,8 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO enableCopyOperationOffload(); } + enableImmediateBcsSplit(); + return returnType; } diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.cpp b/level_zero/core/source/cmdlist/cmdlist_imp.cpp index d3c0e0554b..9eb6591edd 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.cpp +++ b/level_zero/core/source/cmdlist/cmdlist_imp.cpp @@ -257,10 +257,6 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device commandList->isTbxMode = csr->isTbxMode(); commandList->commandListPreemptionMode = device->getDevicePreemptionMode(); - if (!internalUsage) { - commandList->isBcsSplitNeeded = deviceImp->bcsSplit->setupDevice(csr); - } - commandList->copyThroughLockedPtrEnabled = gfxCoreHelper.copyThroughLockedPtrEnabled(hwInfo, productHelper); const bool cmdListSupportsCopyOffload = commandList->isInOrderExecutionEnabled() && !productHelper.isDcFlushAllowed(); @@ -269,9 +265,17 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device commandList->enableCopyOperationOffload(); } + commandList->enableImmediateBcsSplit(); + return commandList; } +void CommandListImp::enableImmediateBcsSplit() { + if (device->getNEODevice()->isBcsSplitSupported() && isImmediateType() && !internalUsage && !isBcsSplitNeeded) { + isBcsSplitNeeded = static_cast(getDevice())->bcsSplit->setupDevice(getCsr(false), isCopyOffloadEnabled()); + } +} + void CommandListImp::enableCopyOperationOffload() { if (isCopyOnly(false) || !static_cast(device)->tryGetCopyEngineOrdinal().has_value()) { return; diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.h b/level_zero/core/source/cmdlist/cmdlist_imp.h index 9f10381009..f433b80df4 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.h +++ b/level_zero/core/source/cmdlist/cmdlist_imp.h @@ -56,6 +56,7 @@ struct CommandListImp : public CommandList { uint64_t getInOrderExecDeviceGpuAddress() const; size_t getInOrderExecHostRequiredSize() const; uint64_t getInOrderExecHostGpuAddress() const; + void enableImmediateBcsSplit(); protected: std::shared_ptr inOrderExecInfo; diff --git a/level_zero/core/source/device/bcs_split.cpp b/level_zero/core/source/device/bcs_split.cpp index 6d5239bb88..548a4bd41d 100644 --- a/level_zero/core/source/device/bcs_split.cpp +++ b/level_zero/core/source/device/bcs_split.cpp @@ -19,7 +19,7 @@ namespace L0 { -bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr) { +bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnabled) { auto &productHelper = this->device.getProductHelper(); auto bcsSplitSettings = productHelper.getBcsSplitSettings(this->device.getHwInfo()); @@ -29,12 +29,12 @@ bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr) { // If expectedTileCount==1, route root device to Tile0, otherwise use all Tiles bool tileCountMatch = (bcsSplitSettings.requiredTileCount == 1) || (this->device.getNEODevice()->getNumSubDevices() == bcsSplitSettings.requiredTileCount); + bool engineMatch = (csr->getOsContext().getEngineType() == productHelper.getDefaultCopyEngine()); + if (copyOffloadEnabled && NEO::debugManager.flags.SplitBcsForCopyOffload.get() != 0) { + engineMatch = NEO::EngineHelpers::isComputeEngine(csr->getOsContext().getEngineType()); + } - auto initializeBcsSplit = this->device.getNEODevice()->isBcsSplitSupported() && - (csr->getOsContext().getEngineType() == productHelper.getDefaultCopyEngine()) && - tileCountMatch; - - if (!initializeBcsSplit) { + if (!(engineMatch && tileCountMatch)) { return false; } diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h index ac365be76e..618320943b 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -169,7 +169,7 @@ struct BcsSplit { return result; } - bool setupDevice(NEO::CommandStreamReceiver *csr); + bool setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnabled); void releaseResources(); std::vector &getCmdListsForSplit(NEO::TransferDirection direction); void setupEnginesMask(NEO::BcsSplitSettings &settings); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp index d56d657a19..8c66ac7713 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp @@ -1110,13 +1110,38 @@ HWTEST2_F(AggregatedBcsSplitTests, givenLimitedEnginesCountWhenCreatingBcsSplitT BcsSplit bcsSplit(static_cast(*device)); - bcsSplit.setupDevice(cmdList->getCsr(false)); + bcsSplit.setupDevice(cmdList->getCsr(false), false); EXPECT_EQ(expectedEnginesCount, bcsSplit.cmdLists.size()); bcsSplit.releaseResources(); } +HWTEST2_F(AggregatedBcsSplitTests, givenCopyOffloadEnabledWhenCreatingCmdListThenEnableBcsSplit, IsAtLeastXeHpcCore) { + debugManager.flags.ForceCopyOperationOffloadForComputeCmdList.set(1); + + ze_result_t returnValue; + ze_command_queue_desc_t desc = { + .flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER, + }; + std::unique_ptr commandList1(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::compute, returnValue)); + auto mockCmdList1 = static_cast> *>(commandList1.get()); + + ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue); + + EXPECT_NE(device->getProductHelper().isDcFlushAllowed(), commandList1->isCopyOffloadEnabled()); + EXPECT_EQ(commandList1->isCopyOffloadEnabled(), mockCmdList1->isBcsSplitNeeded); + + debugManager.flags.SplitBcsForCopyOffload.set(0); + + std::unique_ptr commandList2(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::compute, returnValue)); + auto mockCmdList2 = static_cast> *>(commandList2.get()); + + ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue); + + EXPECT_FALSE(mockCmdList2->isBcsSplitNeeded); +} + HWTEST_F(AggregatedBcsSplitTests, givenTransferDirectionWhenAskingIfSplitIsNeededThenReturnCorrectValue) { debugManager.flags.SplitBcsTransferDirectionMask.set(-1); @@ -1133,7 +1158,7 @@ HWTEST2_F(AggregatedBcsSplitTests, givenPlatformSupporingAggregatedSplitModeWhen BcsSplit bcsSplit(static_cast(*device)); - bcsSplit.setupDevice(cmdList->getCsr(false)); + bcsSplit.setupDevice(cmdList->getCsr(false), false); EXPECT_EQ(device->getL0GfxCoreHelper().bcsSplitAggregatedModeEnabled(), bcsSplit.events.aggregatedEventsMode); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp index cbd8de9076..37634bbafc 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp @@ -4042,7 +4042,7 @@ struct BcsSplitInOrderCmdListTests : public InOrderCmdListFixture { auto bcsSplit = static_cast(device)->bcsSplit.get(); - cmdList->isBcsSplitNeeded = bcsSplit->setupDevice(cmdList->getCsr(false)); + cmdList->isBcsSplitNeeded = bcsSplit->setupDevice(cmdList->getCsr(false), false); return cmdList; } diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 76fa55be77..540c6d4158 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -417,6 +417,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsRequiredTileCount, -1, "-1: default, >=1 DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsRequiredEnginesCount, -1, "-1: default, >=1: required copy engines count in given configuration to enable bcs split") DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsAggregatedEventsMode, -1, "-1: default, 0: disabled, 1: enabled. If enabled, use Aggregated CB Events for all Split operations") DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsTransferDirectionMask, -1, "-1: default, >0: TransferDirection enum mask, indicating supported directions") +DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsForCopyOffload, -1, "-1: default, 0: disabled, 1: enable BCS split for immediate copy offload cmd lists") DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.") DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocations, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers and heaps at initialization of immediate command list.") DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocationsPerCmdQueue, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers for each initialized opencl command queue.") diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index c4b15a9580..f78042939c 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -670,4 +670,5 @@ SplitBcsTransferDirectionMask = -1 EnableShareableWithoutNTHandle = -1 Disable2MBSizeAlignment = 0 InOrderCopyMiFlushSync = -1 +SplitBcsForCopyOffload = -1 # Please don't edit below this line