diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index ed1a845db8..4ecadde896 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -243,6 +243,7 @@ struct CommandListCoreFamily : CommandListImp { void updateStreamProperties(Kernel &kernel, bool isCooperative); void clearCommandsToPatch(); + size_t getTotalSizeForCopyRegion(const ze_copy_region_t *region, uint32_t pitch, uint32_t slicePitch); bool isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size); void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes, diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index e64aed746c..8e6b24f42e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1269,20 +1269,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d callId = neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount; } - size_t dstSize = 0; - size_t srcSize = 0; - - if (srcRegion->depth > 1) { - uint32_t hostPtrDstOffset = dstRegion->originX + ((dstRegion->originY) * dstPitch) + ((dstRegion->originZ) * dstSlicePitch); - uint32_t hostPtrSrcOffset = srcRegion->originX + ((srcRegion->originY) * srcPitch) + ((srcRegion->originZ) * srcSlicePitch); - dstSize = (dstRegion->width * dstRegion->height * dstRegion->depth) + hostPtrDstOffset; - srcSize = (srcRegion->width * srcRegion->height * srcRegion->depth) + hostPtrSrcOffset; - } else { - uint32_t hostPtrDstOffset = dstRegion->originX + ((dstRegion->originY) * dstPitch); - uint32_t hostPtrSrcOffset = srcRegion->originX + ((srcRegion->originY) * srcPitch); - dstSize = (dstRegion->width * dstRegion->height) + hostPtrDstOffset; - srcSize = (srcRegion->width * srcRegion->height) + hostPtrSrcOffset; - } + size_t dstSize = this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch); 
+ size_t srcSize = this->getTotalSizeForCopyRegion(srcRegion, srcPitch, srcSlicePitch); auto dstAllocationStruct = getAlignedAllocation(this->device, dstPtr, dstSize, false); auto srcAllocationStruct = getAlignedAllocation(this->device, srcPtr, srcSize, true); @@ -2386,6 +2374,17 @@ void CommandListCoreFamily::clearCommandsToPatch() { commandsToPatch.clear(); } +template +inline size_t CommandListCoreFamily::getTotalSizeForCopyRegion(const ze_copy_region_t *region, uint32_t pitch, uint32_t slicePitch) { + if (region->depth > 1) { + uint32_t offset = region->originX + ((region->originY) * pitch) + ((region->originZ) * slicePitch); + return (region->width * region->height * region->depth) + offset; + } else { + uint32_t offset = region->originX + ((region->originY) * pitch); + return (region->width * region->height) + offset; + } +} + template bool CommandListCoreFamily::isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size) { constexpr size_t minimalSizeForBcsSplit = 4 * MemoryConstants::megaByte; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index e754e5235e..6c26a2cc43 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -220,7 +220,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( ze_result_t ret; if (this->isAppendSplitNeeded(dstptr, srcptr, size)) { - ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { + ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { return CommandListCoreFamily::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, 
phWaitEvents); }); } else { @@ -248,9 +248,28 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio if (this->isFlushTaskSubmissionEnabled) { checkAvailableSpace(); } - auto ret = CommandListCoreFamily::appendMemoryCopyRegion(dstPtr, dstRegion, dstPitch, dstSlicePitch, - srcPtr, srcRegion, srcPitch, srcSlicePitch, - hSignalEvent, numWaitEvents, phWaitEvents); + + ze_result_t ret; + + if (this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch))) { + ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { + ze_copy_region_t dstRegionLocal = {}; + ze_copy_region_t srcRegionLocal = {}; + memcpy(&dstRegionLocal, dstRegion, sizeof(ze_copy_region_t)); + memcpy(&srcRegionLocal, srcRegion, sizeof(ze_copy_region_t)); + dstRegionLocal.originX = dstOriginXParam; + dstRegionLocal.width = static_cast(sizeParam); + srcRegionLocal.originX = srcOriginXParam; + srcRegionLocal.width = static_cast(sizeParam); + return CommandListCoreFamily::appendMemoryCopyRegion(dstPtr, &dstRegionLocal, dstPitch, dstSlicePitch, + srcPtr, &srcRegionLocal, srcPitch, srcSlicePitch, + hSignalEventParam, numWaitEvents, phWaitEvents); + }); + } else { + ret = CommandListCoreFamily::appendMemoryCopyRegion(dstPtr, dstRegion, dstPitch, dstSlicePitch, + srcPtr, srcRegion, srcPitch, srcSlicePitch, + hSignalEvent, numWaitEvents, phWaitEvents); + } return flushImmediate(ret, true); } diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h index 12b31b1f8d..a7e59b228e 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -47,13 +47,13 @@ struct BcsSplit { std::vector cmdQs; NEO::BcsInfoMask engines = NEO::EngineHelpers::oddLinkedCopyEnginesMask; - template + 
template ze_result_t appendSplitCall(CommandListCoreFamilyImmediate *cmdList, - void *dstptr, - const void *srcptr, + T dstptr, + K srcptr, size_t size, ze_event_handle_t hSignalEvent, - std::function appendCall) { + std::function appendCall) { ze_result_t result = ZE_RESULT_SUCCESS; if (hSignalEvent) { diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp index 4dae832eae..5b002ff0de 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp @@ -467,6 +467,56 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe context->freeMem(dstPtr); } +HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyRegionThenSuccessIsReturned, IsXeHpcCore) { + DebugManagerStateRestore restorer; + DebugManager.flags.SplitBcsCopy.set(1); + + ze_result_t returnValue; + auto hwInfo = *NEO::defaultHwInfo; + hwInfo.featureTable.ftrBcsInfo = 0b111111111; + hwInfo.capabilityTable.blitterOperationsSupported = true; + auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment(&hwInfo); + auto testL0Device = std::unique_ptr(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue)); + + ze_command_queue_desc_t desc = {}; + desc.ordinal = static_cast(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy)); + + std::unique_ptr commandList0(CommandList::createImmediate(productFamily, + testL0Device.get(), + &desc, + false, + NEO::EngineGroupType::Copy, + returnValue)); + ASSERT_NE(nullptr, commandList0); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.cmdQs.size(), 4u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 0u); + 
EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); + + constexpr size_t alignment = 4096u; + constexpr size_t size = 8 * MemoryConstants::megaByte; + void *srcPtr; + void *dstPtr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + context->allocDeviceMem(device->toHandle(), + &deviceDesc, + size, alignment, &srcPtr); + ze_host_mem_alloc_desc_t hostDesc = {}; + context->allocHostMem(&hostDesc, size, alignment, &dstPtr); + ze_copy_region_t region = {2, 1, 1, 4 * MemoryConstants::megaByte, 1, 1}; + + auto result = commandList0->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, nullptr, 0, nullptr); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u); + + context->freeMem(srcPtr); + context->freeMem(dstPtr); +} + HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyWithEventThenSuccessIsReturnedAndMiFlushProgrammed, IsXeHpcCore) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;