fix: stall RelaxedOrdering scheduler when programming Semaphores
Related-To: NEO-7458 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
parent
87eb609958
commit
2383896dbb
|
@ -182,6 +182,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
|
|||
|
||||
void printKernelsPrintfOutput(bool hangDetected);
|
||||
ze_result_t synchronizeInOrderExecution(uint64_t timeout) const;
|
||||
bool hasStallingCmdsForRelaxedOrdering(uint32_t numWaitEvents, bool relaxedOrderingDispatch);
|
||||
|
||||
MOCKABLE_VIRTUAL void checkAssert();
|
||||
ComputeFlushMethodType computeFlushMethod = nullptr;
|
||||
|
|
|
@ -331,6 +331,11 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::waitForEventsFromHost() {
|
|||
return true;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::hasStallingCmdsForRelaxedOrdering(uint32_t numWaitEvents, bool relaxedOrderingDispatch) {
|
||||
return (!relaxedOrderingDispatch && (numWaitEvents > 0 || this->inOrderDependencyCounter > 0));
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernel(
|
||||
ze_kernel_handle_t kernelHandle, const ze_group_count_t *threadGroupDimensions,
|
||||
|
@ -354,7 +359,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernel(
|
|||
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(kernelHandle, threadGroupDimensions,
|
||||
hSignalEvent, numWaitEvents, phWaitEvents,
|
||||
launchParams, relaxedOrderingDispatch);
|
||||
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
|
||||
|
||||
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
@ -370,7 +376,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernelInd
|
|||
|
||||
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelIndirect(kernelHandle, pDispatchArgumentsBuffer,
|
||||
hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
|
||||
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
|
||||
|
||||
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
@ -405,6 +412,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
|||
checkWaitEventsState(numWaitEvents, phWaitEvents);
|
||||
}
|
||||
|
||||
bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch);
|
||||
|
||||
ze_result_t ret;
|
||||
CpuMemCopyInfo cpuMemCopyInfo(dstptr, srcptr, size);
|
||||
this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &cpuMemCopyInfo.srcAllocData);
|
||||
|
@ -420,6 +429,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
|||
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction);
|
||||
if (isSplitNeeded) {
|
||||
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
|
||||
hasStallindCmds = !relaxedOrderingDispatch;
|
||||
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, relaxedOrderingDispatch, true);
|
||||
});
|
||||
|
@ -427,7 +438,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
|||
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent,
|
||||
numWaitEvents, phWaitEvents, relaxedOrderingDispatch, forceDisableCopyOnlyInOrderSignaling);
|
||||
}
|
||||
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
|
||||
|
||||
return flushImmediate(ret, true, hasStallindCmds, relaxedOrderingDispatch, hSignalEvent);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
@ -450,12 +462,16 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
|
|||
checkWaitEventsState(numWaitEvents, phWaitEvents);
|
||||
}
|
||||
|
||||
bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch);
|
||||
|
||||
ze_result_t ret;
|
||||
|
||||
NEO::TransferDirection direction;
|
||||
auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction);
|
||||
if (isSplitNeeded) {
|
||||
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
|
||||
hasStallindCmds = !relaxedOrderingDispatch;
|
||||
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
ze_copy_region_t dstRegionLocal = {};
|
||||
ze_copy_region_t srcRegionLocal = {};
|
||||
|
@ -475,7 +491,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
|
|||
hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch, forceDisableCopyOnlyInOrderSignaling);
|
||||
}
|
||||
|
||||
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
|
||||
return flushImmediate(ret, true, hasStallindCmds, relaxedOrderingDispatch, hSignalEvent);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
@ -493,7 +509,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryFill(void
|
|||
|
||||
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
|
||||
|
||||
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
|
||||
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
@ -620,7 +636,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyRegion
|
|||
|
||||
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendImageCopyRegion(hDstImage, hSrcImage, pDstRegion, pSrcRegion, hSignalEvent,
|
||||
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
|
||||
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
|
||||
|
||||
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
@ -641,7 +658,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyFromMe
|
|||
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendImageCopyFromMemory(hDstImage, srcPtr, pDstRegion, hSignalEvent,
|
||||
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
|
||||
|
||||
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
|
||||
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
@ -662,7 +679,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyToMemo
|
|||
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendImageCopyToMemory(dstPtr, hSrcImage, pSrcRegion, hSignalEvent,
|
||||
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
|
||||
|
||||
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
|
||||
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
@ -694,7 +711,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchCooperati
|
|||
}
|
||||
|
||||
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchCooperativeKernel(kernelHandle, launchKernelArgs, hSignalEvent, numWaitEvents, waitEventHandles, relaxedOrderingDispatch);
|
||||
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
|
||||
|
||||
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/direct_submission/relaxed_ordering_helper.h"
|
||||
#include "shared/source/gmm_helper/gmm_helper.h"
|
||||
#include "shared/source/indirect_heap/indirect_heap.h"
|
||||
#include "shared/source/memory_manager/internal_allocation_storage.h"
|
||||
|
@ -1085,6 +1086,117 @@ HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingTh
|
|||
driverHandle->releaseImportedPointer(dstPtr);
|
||||
}
|
||||
|
||||
HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingDisabledRelaxedOrderingThenPassStallingCmdsInfo, IsAtLeastXeHpcCore) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
|
||||
ze_result_t returnValue;
|
||||
auto commandList = zeUniquePtr(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
|
||||
ASSERT_NE(nullptr, commandList);
|
||||
auto whiteBoxCmdList = static_cast<CommandList *>(commandList.get());
|
||||
|
||||
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.count = 1;
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
|
||||
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
|
||||
ze_event_handle_t event = nullptr;
|
||||
|
||||
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event));
|
||||
std::unique_ptr<L0::Event> eventObject(L0::Event::fromHandle(event));
|
||||
|
||||
Mock<::L0::Kernel> kernel;
|
||||
ze_group_count_t groupCount{1, 1, 1};
|
||||
CmdListKernelLaunchParams launchParams = {};
|
||||
|
||||
uint8_t srcPtr[64] = {};
|
||||
uint8_t dstPtr[64] = {};
|
||||
const ze_copy_region_t region = {0U, 0U, 0U, 1, 1, 0U};
|
||||
|
||||
driverHandle->importExternalPointer(dstPtr, MemoryConstants::pageSize);
|
||||
|
||||
auto ultCsr = static_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(whiteBoxCmdList->csr);
|
||||
ultCsr->recordFlusheBatchBuffer = true;
|
||||
ultCsr->unregisterClient();
|
||||
|
||||
EXPECT_FALSE(NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*ultCsr, 1));
|
||||
|
||||
auto verifyFlags = [&ultCsr](ze_result_t result) {
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
EXPECT_TRUE(ultCsr->recordedDispatchFlags.hasStallingCmds);
|
||||
EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasStallingCmds);
|
||||
};
|
||||
|
||||
auto resetFlags = [&ultCsr]() {
|
||||
ultCsr->recordedDispatchFlags.hasStallingCmds = false;
|
||||
ultCsr->latestFlushedBatchBuffer.hasStallingCmds = false;
|
||||
};
|
||||
|
||||
bool inOrderExecAlreadyEnabled = false;
|
||||
|
||||
for (bool inOrderExecution : {false, true}) {
|
||||
if (inOrderExecution && !inOrderExecAlreadyEnabled) {
|
||||
whiteBoxCmdList->enableInOrderExecution();
|
||||
inOrderExecAlreadyEnabled = true;
|
||||
}
|
||||
|
||||
EXPECT_EQ(inOrderExecAlreadyEnabled, inOrderExecution);
|
||||
|
||||
uint32_t numWaitEvents = inOrderExecution ? 0 : 1;
|
||||
ze_event_handle_t *waitlist = inOrderExecution ? nullptr : &event;
|
||||
|
||||
// non-pipelined state or first in-order exec
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, 1, &event, launchParams, false));
|
||||
|
||||
// non-pipelined state already programmed
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, launchParams, false));
|
||||
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendLaunchKernelIndirect(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, false));
|
||||
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendMemoryCopy(dstPtr, srcPtr, 8, nullptr, numWaitEvents, waitlist, false, false));
|
||||
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendMemoryCopyRegion(dstPtr, ®ion, 0, 0, srcPtr, ®ion, 0, 0, nullptr, numWaitEvents, waitlist, false, false));
|
||||
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendMemoryFill(dstPtr, srcPtr, 8, 1, nullptr, numWaitEvents, waitlist, false));
|
||||
|
||||
if constexpr (FamilyType::supportsSampler) {
|
||||
auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::CopyImageRegion);
|
||||
auto mockBuiltinKernel = static_cast<Mock<::L0::Kernel> *>(kernel);
|
||||
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
|
||||
|
||||
auto image = std::make_unique<WhiteBox<::L0::ImageCoreFamily<gfxCoreFamily>>>();
|
||||
ze_image_region_t imgRegion = {1, 1, 1, 1, 1, 1};
|
||||
ze_image_desc_t zeDesc = {};
|
||||
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
|
||||
image->initialize(device, &zeDesc);
|
||||
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendImageCopyRegion(image->toHandle(), image->toHandle(), &imgRegion, &imgRegion, nullptr, numWaitEvents, waitlist, false));
|
||||
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendImageCopyFromMemory(image->toHandle(), dstPtr, &imgRegion, nullptr, numWaitEvents, waitlist, false));
|
||||
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendImageCopyToMemory(dstPtr, image->toHandle(), &imgRegion, nullptr, numWaitEvents, waitlist, false));
|
||||
}
|
||||
|
||||
resetFlags();
|
||||
verifyFlags(commandList->appendLaunchCooperativeKernel(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, false));
|
||||
}
|
||||
|
||||
driverHandle->releaseImportedPointer(dstPtr);
|
||||
}
|
||||
|
||||
HWTEST2_F(CommandListCreate, whenDispatchingThenPassNumCsrClients, IsAtLeastXeHpcCore) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
|
||||
|
|
Loading…
Reference in New Issue