diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 03ed09c933..b5f553679b 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -361,7 +361,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( return performCpuMemcpy(dstptr, srcptr, size, dstAllocData, srcAllocData, hSignalEvent, numWaitEvents, phWaitEvents); } - if (this->isAppendSplitNeeded(dstptr, srcptr, size)) { + auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size); + if (isSplitNeeded) { ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, true, (numWaitEvents > 0), [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { return CommandListCoreFamily::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents); }); @@ -369,7 +370,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( ret = CommandListCoreFamily::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents); } - return flushImmediate(ret, true, false, (numWaitEvents > 0), hSignalEvent); + return flushImmediate(ret, true, false, (numWaitEvents > 0) || isSplitNeeded, hSignalEvent); } template @@ -393,7 +394,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio ze_result_t ret; - if (this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch))) { + auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch)); + if (isSplitNeeded) { ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, true, (numWaitEvents > 0), [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { ze_copy_region_t dstRegionLocal = {}; ze_copy_region_t srcRegionLocal = {}; @@ -413,7 +415,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio hSignalEvent, numWaitEvents, phWaitEvents); } - return flushImmediate(ret, true, false, (numWaitEvents > 0), hSignalEvent); + return flushImmediate(ret, true, false, (numWaitEvents > 0) || isSplitNeeded, hSignalEvent); } template @@ -467,7 +469,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendPageFaultCopy(N ze_result_t ret; - if (this->isAppendSplitNeeded(dstAllocation->getMemoryPool(), srcAllocation->getMemoryPool(), size)) { + auto isSplitNeeded = this->isAppendSplitNeeded(dstAllocation->getMemoryPool(), srcAllocation->getMemoryPool(), size); + if (isSplitNeeded) { uintptr_t dstAddress = static_cast(dstAllocation->getGpuAddress()); uintptr_t srcAddress = static_cast(srcAllocation->getGpuAddress()); ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstAddress, srcAddress, size, nullptr, false, false, [&](uintptr_t dstAddressParam, uintptr_t srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { @@ -479,7 +482,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendPageFaultCopy(N } else { ret = CommandListCoreFamily::appendPageFaultCopy(dstAllocation, srcAllocation, size, flushHost); } - return flushImmediate(ret, false, false, false, nullptr); + return flushImmediate(ret, false, false, isSplitNeeded, nullptr); } template diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp index af50d8f02e..138a98cdad 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp @@ -8,6 +8,7 @@ #include "shared/source/os_interface/hw_info_config.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/helpers/default_hw_info.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_command_stream_receiver.h" #include "shared/test/common/test_macros/hw_test.h" @@ -472,6 +473,8 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenFlushTaskSubmissionEnabledAndSplitBcsC size, alignment, &srcPtr); ze_host_mem_alloc_desc_t hostDesc = {}; context->allocHostMem(&hostDesc, size, alignment, &dstPtr); + auto ultCsr = static_cast *>(commandList0->csr); + ultCsr->recordFlusheBatchBuffer = true; auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr); ASSERT_EQ(ZE_RESULT_SUCCESS, result); @@ -483,6 +486,7 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenFlushTaskSubmissionEnabledAndSplitBcsC EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getCsr()->peekTaskCount(), 1u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getCsr()->peekTaskCount(), 1u); EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getCsr()->peekTaskCount(), 1u); + EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasRelaxedOrderingDependencies); context->freeMem(srcPtr); context->freeMem(dstPtr);