diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index 812528d41f..e455287f87 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -182,6 +182,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily::waitForEventsFromHost() { return true; } +template +bool CommandListCoreFamilyImmediate::hasStallingCmdsForRelaxedOrdering(uint32_t numWaitEvents, bool relaxedOrderingDispatch) { + return (!relaxedOrderingDispatch && (numWaitEvents > 0 || this->inOrderDependencyCounter > 0)); +} + template ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernel( ze_kernel_handle_t kernelHandle, const ze_group_count_t *threadGroupDimensions, @@ -354,7 +359,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernel( auto ret = CommandListCoreFamily::appendLaunchKernel(kernelHandle, threadGroupDimensions, hSignalEvent, numWaitEvents, phWaitEvents, launchParams, relaxedOrderingDispatch); - return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); + + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent); } template @@ -370,7 +376,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernelInd auto ret = CommandListCoreFamily::appendLaunchKernelIndirect(kernelHandle, pDispatchArgumentsBuffer, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); + + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent); } template @@ -405,6 +412,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( checkWaitEventsState(numWaitEvents, phWaitEvents); } + bool hasStallindCmds = 
hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch); + ze_result_t ret; CpuMemCopyInfo cpuMemCopyInfo(dstptr, srcptr, size); this->device->getDriverHandle()->findAllocationDataForRange(const_cast(srcptr), size, &cpuMemCopyInfo.srcAllocData); @@ -420,6 +429,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction); if (isSplitNeeded) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event + hasStallindCmds = !relaxedOrderingDispatch; + ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { return CommandListCoreFamily::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, relaxedOrderingDispatch, true); }); @@ -427,7 +438,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( ret = CommandListCoreFamily::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch, forceDisableCopyOnlyInOrderSignaling); } - return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); + + return flushImmediate(ret, true, hasStallindCmds, relaxedOrderingDispatch, hSignalEvent); } template @@ -450,12 +462,16 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio checkWaitEventsState(numWaitEvents, phWaitEvents); } + bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch); + ze_result_t ret; NEO::TransferDirection direction; auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction); if (isSplitNeeded) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // 
split generates more than 1 event + hasStallindCmds = !relaxedOrderingDispatch; + ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { ze_copy_region_t dstRegionLocal = {}; ze_copy_region_t srcRegionLocal = {}; @@ -475,7 +491,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch, forceDisableCopyOnlyInOrderSignaling); } - return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); + return flushImmediate(ret, true, hasStallindCmds, relaxedOrderingDispatch, hSignalEvent); } template @@ -493,7 +509,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryFill(void auto ret = CommandListCoreFamily::appendMemoryFill(ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent); } template @@ -620,7 +636,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyRegion auto ret = CommandListCoreFamily::appendImageCopyRegion(hDstImage, hSrcImage, pDstRegion, pSrcRegion, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); + + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent); } template @@ -641,7 +658,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyFromMe auto ret = CommandListCoreFamily::appendImageCopyFromMemory(hDstImage, 
srcPtr, pDstRegion, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent); } template @@ -662,7 +679,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyToMemo auto ret = CommandListCoreFamily::appendImageCopyToMemory(dstPtr, hSrcImage, pSrcRegion, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent); } template @@ -694,7 +711,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchCooperati } auto ret = CommandListCoreFamily::appendLaunchCooperativeKernel(kernelHandle, launchKernelArgs, hSignalEvent, numWaitEvents, waitEventHandles, relaxedOrderingDispatch); - return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); + + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent); } template diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index bffe52aebe..dae17c1a21 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/command_stream/wait_status.h" +#include "shared/source/direct_submission/relaxed_ordering_helper.h" #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/indirect_heap/indirect_heap.h" #include "shared/source/memory_manager/internal_allocation_storage.h" @@ -1085,6 +1086,117 @@ 
HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingTh driverHandle->releaseImportedPointer(dstPtr); } +HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingDisabledRelaxedOrderingThenPassStallingCmdsInfo, IsAtLeastXeHpcCore) { + ze_command_queue_desc_t desc = {}; + desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + ze_result_t returnValue; + auto commandList = zeUniquePtr(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue)); + ASSERT_NE(nullptr, commandList); + auto whiteBoxCmdList = static_cast(commandList.get()); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_event_handle_t event = nullptr; + + std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event)); + std::unique_ptr eventObject(L0::Event::fromHandle(event)); + + Mock<::L0::Kernel> kernel; + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + + uint8_t srcPtr[64] = {}; + uint8_t dstPtr[64] = {}; + const ze_copy_region_t region = {0U, 0U, 0U, 1, 1, 0U}; + + driverHandle->importExternalPointer(dstPtr, MemoryConstants::pageSize); + + auto ultCsr = static_cast *>(whiteBoxCmdList->csr); + ultCsr->recordFlusheBatchBuffer = true; + ultCsr->unregisterClient(); + + EXPECT_FALSE(NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*ultCsr, 1)); + + auto verifyFlags = [&ultCsr](ze_result_t result) { + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_TRUE(ultCsr->recordedDispatchFlags.hasStallingCmds); + EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasStallingCmds); + }; + + auto 
resetFlags = [&ultCsr]() { + ultCsr->recordedDispatchFlags.hasStallingCmds = false; + ultCsr->latestFlushedBatchBuffer.hasStallingCmds = false; + }; + + bool inOrderExecAlreadyEnabled = false; + + for (bool inOrderExecution : {false, true}) { + if (inOrderExecution && !inOrderExecAlreadyEnabled) { + whiteBoxCmdList->enableInOrderExecution(); + inOrderExecAlreadyEnabled = true; + } + + EXPECT_EQ(inOrderExecAlreadyEnabled, inOrderExecution); + + uint32_t numWaitEvents = inOrderExecution ? 0 : 1; + ze_event_handle_t *waitlist = inOrderExecution ? nullptr : &event; + + // non-pipelined state or first in-order exec + resetFlags(); + verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, 1, &event, launchParams, false)); + + // non-pipelined state already programmed + resetFlags(); + verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, launchParams, false)); + + resetFlags(); + verifyFlags(commandList->appendLaunchKernelIndirect(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, false)); + + resetFlags(); + verifyFlags(commandList->appendMemoryCopy(dstPtr, srcPtr, 8, nullptr, numWaitEvents, waitlist, false, false)); + + resetFlags(); + verifyFlags(commandList->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, nullptr, numWaitEvents, waitlist, false, false)); + + resetFlags(); + verifyFlags(commandList->appendMemoryFill(dstPtr, srcPtr, 8, 1, nullptr, numWaitEvents, waitlist, false)); + + if constexpr (FamilyType::supportsSampler) { + auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::CopyImageRegion); + auto mockBuiltinKernel = static_cast *>(kernel); + mockBuiltinKernel->setArgRedescribedImageCallBase = false; + + auto image = std::make_unique>>(); + ze_image_region_t imgRegion = {1, 1, 1, 1, 1, 1}; + ze_image_desc_t zeDesc = {}; + zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + image->initialize(device, &zeDesc); + + resetFlags(); 
+ verifyFlags(commandList->appendImageCopyRegion(image->toHandle(), image->toHandle(), &imgRegion, &imgRegion, nullptr, numWaitEvents, waitlist, false)); + + resetFlags(); + verifyFlags(commandList->appendImageCopyFromMemory(image->toHandle(), dstPtr, &imgRegion, nullptr, numWaitEvents, waitlist, false)); + + resetFlags(); + verifyFlags(commandList->appendImageCopyToMemory(dstPtr, image->toHandle(), &imgRegion, nullptr, numWaitEvents, waitlist, false)); + } + + resetFlags(); + verifyFlags(commandList->appendLaunchCooperativeKernel(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, false)); + } + + driverHandle->releaseImportedPointer(dstPtr); +} + HWTEST2_F(CommandListCreate, whenDispatchingThenPassNumCsrClients, IsAtLeastXeHpcCore) { ze_command_queue_desc_t desc = {}; desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;