From 7eb91e3b04c5633f74e53611888be7bc198c91fe Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Mon, 13 Feb 2023 06:19:28 +0000 Subject: [PATCH] Split the L0 BCS split into D2H and H2D -use separate pair of engines for D2H and H2D transfers Related-To: NEO-7716 Signed-off-by: Lukasz Jobczyk --- level_zero/core/source/cmdlist/cmdlist_hw.h | 5 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 10 +- .../source/cmdlist/cmdlist_hw_immediate.inl | 17 ++-- level_zero/core/source/device/bcs_split.cpp | 32 +++++++ level_zero/core/source/device/bcs_split.h | 25 +++-- .../xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp | 92 +++++++++++++++---- .../source/command_queue/csr_selection_args.h | 25 +---- shared/source/command_stream/CMakeLists.txt | 3 +- .../command_stream/transfer_direction.h | 33 +++++++ .../debug_settings/debug_variables_base.inl | 2 + shared/test/common/test_files/igdrcl.config | 2 + 11 files changed, 182 insertions(+), 64 deletions(-) create mode 100644 shared/source/command_stream/transfer_direction.h diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index b8235820eb..ad67a35cf6 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -7,6 +7,7 @@ #pragma once +#include "shared/source/command_stream/transfer_direction.h" #include "shared/source/helpers/hw_mapper.h" #include "shared/source/helpers/pipe_control_args.h" #include "shared/source/helpers/vec.h" @@ -264,8 +265,8 @@ struct CommandListCoreFamily : CommandListImp { void clearCommandsToPatch(); size_t getTotalSizeForCopyRegion(const ze_copy_region_t *region, uint32_t pitch, uint32_t slicePitch); - bool isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size); - bool isAppendSplitNeeded(NEO::MemoryPool dstPool, NEO::MemoryPool srcPool, size_t size); + bool isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size, NEO::TransferDirection &directionOut); + bool isAppendSplitNeeded(NEO::MemoryPool dstPool, NEO::MemoryPool srcPool, size_t size, NEO::TransferDirection &directionOut); void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes, const void **pRanges); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 1fe7c6221d..fde026f4d3 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2501,26 +2501,28 @@ inline size_t CommandListCoreFamily::getTotalSizeForCopyRegion(co } template -bool CommandListCoreFamily::isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size) { +bool CommandListCoreFamily::isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size, NEO::TransferDirection &directionOut) { NEO::SvmAllocationData *srcAllocData = nullptr; NEO::SvmAllocationData *dstAllocData = nullptr; bool srcAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(const_cast(srcPtr), size, &srcAllocData); bool dstAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(dstPtr, size, &dstAllocData); if (srcAllocFound && dstAllocFound) { - return this->isAppendSplitNeeded(dstAllocData->gpuAllocations.getDefaultGraphicsAllocation()->getMemoryPool(), srcAllocData->gpuAllocations.getDefaultGraphicsAllocation()->getMemoryPool(), size); + return this->isAppendSplitNeeded(dstAllocData->gpuAllocations.getDefaultGraphicsAllocation()->getMemoryPool(), srcAllocData->gpuAllocations.getDefaultGraphicsAllocation()->getMemoryPool(), size, directionOut); } return false; } template -inline bool CommandListCoreFamily::isAppendSplitNeeded(NEO::MemoryPool dstPool, NEO::MemoryPool srcPool, size_t size) { +inline bool CommandListCoreFamily::isAppendSplitNeeded(NEO::MemoryPool dstPool, NEO::MemoryPool srcPool, size_t size, NEO::TransferDirection &directionOut) { constexpr size_t minimalSizeForBcsSplit = 4 * MemoryConstants::megaByte; + directionOut = NEO::createTransferDirection(!NEO::MemoryPoolHelper::isSystemMemoryPool(srcPool), !NEO::MemoryPoolHelper::isSystemMemoryPool(dstPool)); + return this->isBcsSplitNeeded && size >= minimalSizeForBcsSplit && - (NEO::MemoryPoolHelper::isSystemMemoryPool(dstPool) || NEO::MemoryPoolHelper::isSystemMemoryPool(srcPool)); + directionOut != NEO::TransferDirection::LocalToLocal; } template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 501c085e17..789c5de4c5 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -393,10 +393,11 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); - auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size); + NEO::TransferDirection direction; + auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction); if (isSplitNeeded) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event - ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { + ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { return CommandListCoreFamily::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, relaxedOrderingDispatch); }); } else { @@ -429,10 +430,11 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); - auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch)); + NEO::TransferDirection direction; + auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction); if (isSplitNeeded) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event - ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { + ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { ze_copy_region_t dstRegionLocal = {}; ze_copy_region_t srcRegionLocal = {}; memcpy(&dstRegionLocal, dstRegion, sizeof(ze_copy_region_t)); @@ -508,7 +510,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendPageFaultCopy(N ze_result_t ret; - auto isSplitNeeded = this->isAppendSplitNeeded(dstAllocation->getMemoryPool(), srcAllocation->getMemoryPool(), size); + NEO::TransferDirection direction; + auto isSplitNeeded = this->isAppendSplitNeeded(dstAllocation->getMemoryPool(), srcAllocation->getMemoryPool(), size, direction); bool relaxedOrdering = false; @@ -516,11 +519,11 @@ ze_result_t CommandListCoreFamilyImmediate::appendPageFaultCopy(N relaxedOrdering = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event uintptr_t dstAddress = static_cast(dstAllocation->getGpuAddress()); uintptr_t srcAddress = static_cast(srcAllocation->getGpuAddress()); - ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstAddress, srcAddress, size, nullptr, 0u, nullptr, false, relaxedOrdering, [&](uintptr_t dstAddressParam, uintptr_t srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { + ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstAddress, srcAddress, size, nullptr, 0u, nullptr, false, relaxedOrdering, direction, [&](uintptr_t dstAddressParam, uintptr_t srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { this->appendMemoryCopyBlit(dstAddressParam, dstAllocation, 0u, srcAddressParam, srcAllocation, 0u, sizeParam); - return this->appendSignalEvent(hSignalEventParam); + return CommandListCoreFamily::appendSignalEvent(hSignalEventParam); }); } else { ret = CommandListCoreFamily::appendPageFaultCopy(dstAllocation, srcAllocation, size, flushHost); diff --git a/level_zero/core/source/device/bcs_split.cpp b/level_zero/core/source/device/bcs_split.cpp index f4e36cfdb8..ce741eec42 100644 --- a/level_zero/core/source/device/bcs_split.cpp +++ b/level_zero/core/source/device/bcs_split.cpp @@ -66,6 +66,26 @@ bool BcsSplit::setupDevice(uint32_t productFamily, bool internalUsage, const ze_ this->cmdQs.push_back(commandQueue); } + if (NEO::DebugManager.flags.SplitBcsMaskH2D.get() > 0) { + this->h2dEngines = NEO::DebugManager.flags.SplitBcsMaskH2D.get(); + } + if (NEO::DebugManager.flags.SplitBcsMaskD2H.get() > 0) { + this->d2hEngines = NEO::DebugManager.flags.SplitBcsMaskD2H.get(); + } + + uint32_t cmdQIndex = 0u; + for (uint32_t i = 0; i < NEO::bcsInfoMaskSize; i++) { + if (this->engines.test(i)) { + if (this->h2dEngines.test(i)) { + this->h2dCmdQs.push_back(this->cmdQs[cmdQIndex]); + } + if (this->d2hEngines.test(i)) { + this->d2hCmdQs.push_back(this->cmdQs[cmdQIndex]); + } + cmdQIndex++; + } + } + return true; } @@ -78,10 +98,22 @@ void BcsSplit::releaseResources() { cmdQ->destroy(); } cmdQs.clear(); + d2hCmdQs.clear(); + h2dCmdQs.clear(); this->events.releaseResources(); } } +std::vector &BcsSplit::getCmdQsForSplit(NEO::TransferDirection direction) { + if (direction == NEO::TransferDirection::HostToLocal) { + return this->h2dCmdQs; + } else if (direction == NEO::TransferDirection::LocalToHost) { + return this->d2hCmdQs; + } + + return this->cmdQs; +} + size_t BcsSplit::Events::obtainForSplit(Context *context, size_t maxEventCountInPool) { for (size_t i = 0; i < this->marker.size(); i++) { auto ret = this->marker[i]->queryStatus(); diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h index f373fe56e4..9fe81e9884 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -7,6 +7,7 @@ #pragma once +#include "shared/source/command_stream/transfer_direction.h" #include "shared/source/helpers/engine_node_helper.h" #include "shared/source/sku_info/sku_info_base.h" @@ -51,7 +52,14 @@ struct BcsSplit { } events; std::vector cmdQs; + std::vector h2dCmdQs; + std::vector d2hCmdQs; + + inline static constexpr size_t h2dEngineMask = 0b010001000; + inline static constexpr size_t d2hEngineMask = 0b000100010; NEO::BcsInfoMask engines = NEO::EngineHelpers::oddLinkedCopyEnginesMask; + NEO::BcsInfoMask h2dEngines = h2dEngineMask; + NEO::BcsInfoMask d2hEngines = d2hEngineMask; template ze_result_t appendSplitCall(CommandListCoreFamilyImmediate *cmdList, @@ -63,6 +71,7 @@ struct BcsSplit { ze_event_handle_t *phWaitEvents, bool performMigration, bool hasRelaxedOrderingDependencies, + NEO::TransferDirection direction, std::function appendCall) { ze_result_t result = ZE_RESULT_SUCCESS; @@ -76,9 +85,11 @@ struct BcsSplit { auto subcopyEventIndex = markerEventIndex * this->cmdQs.size(); StackVec eventHandles; + auto &cmdQsForSplit = this->getCmdQsForSplit(direction); + auto totalSize = size; - auto engineCount = this->cmdQs.size(); - for (size_t i = 0; i < this->cmdQs.size(); i++) { + auto engineCount = cmdQsForSplit.size(); + for (size_t i = 0; i < cmdQsForSplit.size(); i++) { if (barrierRequired) { auto barrierEventHandle = this->events.barrier[markerEventIndex]->toHandle(); cmdList->addEventsToCmdList(1u, &barrierEventHandle, hasRelaxedOrderingDependencies, false); @@ -97,9 +108,9 @@ struct BcsSplit { result = appendCall(localDstPtr, localSrcPtr, localSize, eventHandle); if (cmdList->isFlushTaskSubmissionEnabled) { - cmdList->executeCommandListImmediateWithFlushTaskImpl(performMigration, false, hasRelaxedOrderingDependencies, this->cmdQs[i]); + cmdList->executeCommandListImmediateWithFlushTaskImpl(performMigration, false, hasRelaxedOrderingDependencies, cmdQsForSplit[i]); } else { - cmdList->executeCommandListImmediateImpl(performMigration, this->cmdQs[i]); + cmdList->executeCommandListImmediateImpl(performMigration, cmdQsForSplit[i]); } eventHandles.push_back(eventHandle); @@ -108,18 +119,18 @@ struct BcsSplit { engineCount--; } - cmdList->addEventsToCmdList(static_cast(this->cmdQs.size()), eventHandles.data(), hasRelaxedOrderingDependencies, false); - cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], false, true); - + cmdList->addEventsToCmdList(static_cast(cmdQsForSplit.size()), eventHandles.data(), hasRelaxedOrderingDependencies, false); if (hSignalEvent) { cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), false, true); } + cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], false, true); return result; } bool setupDevice(uint32_t productFamily, bool internalUsage, const ze_command_queue_desc_t *desc, NEO::CommandStreamReceiver *csr); void releaseResources(); + std::vector &getCmdQsForSplit(NEO::TransferDirection direction); BcsSplit(DeviceImp &device) : device(device), events(*this){}; }; diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp index efb9aaa4f2..5cb76819a2 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp @@ -429,7 +429,7 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe context->freeMem(dstPtr); } -HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyFromDeviceToDeviceThenDoSplit, IsXeHpcCore) { +HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyFromDeviceToDeviceThenDoNotSplit, IsXeHpcCore) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; DebugManagerStateRestore restorer; @@ -532,9 +532,9 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenFlushTaskSubmissionEnabledAndSplitBcsC EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getCsr()->peekTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getCsr()->peekTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getCsr()->peekTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getCsr()->peekTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getCsr()->peekTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getCsr()->peekTaskCount(), 0u); EXPECT_FALSE(ultCsr->latestFlushedBatchBuffer.hasRelaxedOrderingDependencies); context->freeMem(srcPtr); @@ -594,9 +594,9 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenFlushTaskSubmissionEnabledAndSplitBcsC EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getCsr()->peekTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getCsr()->peekTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getCsr()->peekTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getCsr()->peekTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getCsr()->peekTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getCsr()->peekTaskCount(), 0u); EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasRelaxedOrderingDependencies); context->freeMem(srcPtr); @@ -651,9 +651,9 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenRelaxedOrderingNotAllowedWhenDispatchS EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getCsr()->peekTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getCsr()->peekTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getCsr()->peekTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getCsr()->peekTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getCsr()->peekTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getCsr()->peekTaskCount(), 0u); EXPECT_FALSE(ultCsr->latestFlushedBatchBuffer.hasRelaxedOrderingDependencies); uint32_t semaphoresFound = 0; @@ -666,13 +666,13 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenRelaxedOrderingNotAllowedWhenDispatchS } } - EXPECT_EQ(4u, semaphoresFound); + EXPECT_EQ(2u, semaphoresFound); context->freeMem(srcPtr); context->freeMem(dstPtr); } -HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyThenSuccessIsReturned, IsXeHpcCore) { +HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyD2HThenSuccessIsReturned, IsXeHpcCore) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; DebugManagerStateRestore restorer; @@ -716,8 +716,60 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr, false); ASSERT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); + + context->freeMem(srcPtr); + context->freeMem(dstPtr); +} + +HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyH2DThenSuccessIsReturned, IsXeHpcCore) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + + DebugManagerStateRestore restorer; + DebugManager.flags.SplitBcsCopy.set(1); + DebugManager.flags.EnableFlushTaskSubmission.set(0); + + ze_result_t returnValue; + auto hwInfo = *NEO::defaultHwInfo; + hwInfo.featureTable.ftrBcsInfo = 0b111111111; + hwInfo.capabilityTable.blitterOperationsSupported = true; + auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment(&hwInfo); + auto testL0Device = std::unique_ptr(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue)); + + ze_command_queue_desc_t desc = {}; + desc.ordinal = static_cast(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy)); + + std::unique_ptr commandList0(CommandList::createImmediate(productFamily, + testL0Device.get(), + &desc, + false, + NEO::EngineGroupType::Copy, + returnValue)); + ASSERT_NE(nullptr, commandList0); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.cmdQs.size(), 4u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 0u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); + + constexpr size_t alignment = 4096u; + constexpr size_t size = 8 * MemoryConstants::megaByte; + void *srcPtr; + void *dstPtr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + context->allocDeviceMem(device->toHandle(), + &deviceDesc, + size, alignment, &dstPtr); + ze_host_mem_alloc_desc_t hostDesc = {}; + context->allocHostMem(&hostDesc, size, alignment, &srcPtr); + + auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr, false); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 0u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u); context->freeMem(srcPtr); @@ -767,9 +819,9 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe auto result = commandList0->appendMemoryCopyRegion(dstPtr, ®ion, 0, 0, srcPtr, ®ion, 0, 0, nullptr, 0, nullptr, false); ASSERT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); context->freeMem(srcPtr); context->freeMem(dstPtr); @@ -831,9 +883,9 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, event->toHandle(), 0, nullptr, false); ASSERT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, commandList0->commandContainer.getCommandStream()->getCpuBase(), commandList0->commandContainer.getCommandStream()->getUsed())); @@ -891,9 +943,9 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr, false); ASSERT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, commandList0->commandContainer.getCommandStream()->getCpuBase(), commandList0->commandContainer.getCommandStream()->getUsed())); @@ -965,9 +1017,9 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, event->toHandle(), 0, nullptr, false); ASSERT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, commandList0->commandContainer.getCommandStream()->getCpuBase(), commandList0->commandContainer.getCommandStream()->getUsed())); @@ -1155,9 +1207,9 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe false); ASSERT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u); - EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); context->freeMem(srcPtr); context->freeMem(dstPtr); diff --git a/opencl/source/command_queue/csr_selection_args.h b/opencl/source/command_queue/csr_selection_args.h index bf780217f2..f190cc1c85 100644 --- a/opencl/source/command_queue/csr_selection_args.h +++ b/opencl/source/command_queue/csr_selection_args.h @@ -6,6 +6,8 @@ */ #pragma once +#include "shared/source/command_stream/transfer_direction.h" + #include "opencl/source/api/cl_types.h" namespace NEO { @@ -14,13 +16,6 @@ class GraphicsAllocation; class Image; class Buffer; -enum class TransferDirection { - HostToHost, - HostToLocal, - LocalToHost, - LocalToLocal, -}; - struct CsrSelectionArgs { struct Resource { bool isLocal = false; @@ -70,22 +65,6 @@ struct CsrSelectionArgs { static void processResource(const MultiGraphicsAllocation &multiGfxAlloc, uint32_t rootDeviceIndex, Resource &outResource); static void processResource(const GraphicsAllocation &gfxAlloc, uint32_t rootDeviceIndex, Resource &outResource); - - static inline TransferDirection createTransferDirection(bool srcLocal, bool dstLocal) { - if (srcLocal) { - if (dstLocal) { - return TransferDirection::LocalToLocal; - } else { - return TransferDirection::LocalToHost; - } - } else { - if (dstLocal) { - return TransferDirection::HostToLocal; - } else { - return TransferDirection::HostToHost; - } - } - } }; } // namespace NEO diff --git a/shared/source/command_stream/CMakeLists.txt b/shared/source/command_stream/CMakeLists.txt index ac42ac43c3..bf4b8455a4 100644 --- a/shared/source/command_stream/CMakeLists.txt +++ b/shared/source/command_stream/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2019-2022 Intel Corporation +# Copyright (C) 2019-2023 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -60,6 +60,7 @@ set(NEO_CORE_COMMAND_STREAM ${CMAKE_CURRENT_SOURCE_DIR}/tbx_command_stream_receiver_hw.inl ${CMAKE_CURRENT_SOURCE_DIR}/tbx_stream.cpp ${CMAKE_CURRENT_SOURCE_DIR}/thread_arbitration_policy.h + ${CMAKE_CURRENT_SOURCE_DIR}/transfer_direction.h ${CMAKE_CURRENT_SOURCE_DIR}/wait_status.h ) diff --git a/shared/source/command_stream/transfer_direction.h b/shared/source/command_stream/transfer_direction.h new file mode 100644 index 0000000000..9343b7b976 --- /dev/null +++ b/shared/source/command_stream/transfer_direction.h @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +namespace NEO { +enum class TransferDirection { + HostToHost, + HostToLocal, + LocalToHost, + LocalToLocal, +}; + +inline TransferDirection createTransferDirection(bool srcLocal, bool dstLocal) { + if (srcLocal) { + if (dstLocal) { + return TransferDirection::LocalToLocal; + } else { + return TransferDirection::LocalToHost; + } + } else { + if (dstLocal) { + return TransferDirection::HostToLocal; + } else { + return TransferDirection::HostToHost; + } + } +} +} // namespace NEO \ No newline at end of file diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index c6e7af879f..b5976f9d63 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -306,6 +306,8 @@ DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQBcsInitialization, -1, "-1: default, 0: DECLARE_DEBUG_VARIABLE(int32_t, PreferInternalBcsEngine, -1, "-1: default, 0:disabled, 1: enabled. When enabled use internal BCS engine for internal transfers, when disabled use regular engine") DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsCopy, -1, "-1: default, 0:disabled, 1: enabled. When enqueues copy to main copy engine then split between even linked copy engines") DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMask, 0, "0: default, >0: bitmask: indicates bcs engines for split") +DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskH2D, 0, "0: default, >0: bitmask: indicates bcs engines for H2D split") +DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskD2H, 0, "0: default, >0: bitmask: indicates bcs engines for D2H split") DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.") DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocations, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers and heaps at initialization of immediate command list.") DECLARE_DEBUG_VARIABLE(int32_t, UseHighAlignmentForHeapExtended, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver aligns HEAP_EXTENDED allocations to GPU VA that is next power of 2 for a given size, if disables GPU VA is using 2MB/64KB alignment.") diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index dbee767b47..dccd121cf6 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -418,6 +418,8 @@ DeferCmdQGpgpuInitialization = -1 DeferCmdQBcsInitialization = -1 SplitBcsCopy = -1 SplitBcsMask = 0 +SplitBcsMaskH2D = 0 +SplitBcsMaskD2H = 0 PreferInternalBcsEngine = -1 ReuseKernelBinaries = -1 EnableChipsetUniqueUUID = -1