fix: stall RelaxedOrdering scheduler when programming Semaphores

Related-To: NEO-7458

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz 2023-06-27 15:48:18 +00:00 committed by Compute-Runtime-Automation
parent 87eb609958
commit 2383896dbb
3 changed files with 140 additions and 9 deletions

View File

@ -182,6 +182,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
void printKernelsPrintfOutput(bool hangDetected);
ze_result_t synchronizeInOrderExecution(uint64_t timeout) const;
bool hasStallingCmdsForRelaxedOrdering(uint32_t numWaitEvents, bool relaxedOrderingDispatch);
MOCKABLE_VIRTUAL void checkAssert();
ComputeFlushMethodType computeFlushMethod = nullptr;

View File

@ -331,6 +331,11 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::waitForEventsFromHost() {
return true;
}
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::hasStallingCmdsForRelaxedOrdering(uint32_t numWaitEvents, bool relaxedOrderingDispatch) {
    // Decides the "has stalling commands" value that the append paths hand to flushImmediate.
    // A dispatch that goes through relaxed ordering is never reported as stalling.
    if (relaxedOrderingDispatch) {
        return false;
    }

    // Otherwise the submission is reported as stalling when it waits on at least one event,
    // or when this command list already carries an in-order dependency.
    const bool waitsOnEvents = (numWaitEvents > 0);
    return waitsOnEvents || (this->inOrderDependencyCounter > 0);
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernel(
ze_kernel_handle_t kernelHandle, const ze_group_count_t *threadGroupDimensions,
@ -354,7 +359,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernel(
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(kernelHandle, threadGroupDimensions,
hSignalEvent, numWaitEvents, phWaitEvents,
launchParams, relaxedOrderingDispatch);
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -370,7 +376,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernelInd
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelIndirect(kernelHandle, pDispatchArgumentsBuffer,
hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -405,6 +412,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
checkWaitEventsState(numWaitEvents, phWaitEvents);
}
bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch);
ze_result_t ret;
CpuMemCopyInfo cpuMemCopyInfo(dstptr, srcptr, size);
this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &cpuMemCopyInfo.srcAllocData);
@ -420,6 +429,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction);
if (isSplitNeeded) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
hasStallindCmds = !relaxedOrderingDispatch;
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, relaxedOrderingDispatch, true);
});
@ -427,7 +438,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent,
numWaitEvents, phWaitEvents, relaxedOrderingDispatch, forceDisableCopyOnlyInOrderSignaling);
}
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
return flushImmediate(ret, true, hasStallindCmds, relaxedOrderingDispatch, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -450,12 +462,16 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
checkWaitEventsState(numWaitEvents, phWaitEvents);
}
bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch);
ze_result_t ret;
NEO::TransferDirection direction;
auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction);
if (isSplitNeeded) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
hasStallindCmds = !relaxedOrderingDispatch;
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
ze_copy_region_t dstRegionLocal = {};
ze_copy_region_t srcRegionLocal = {};
@ -475,7 +491,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch, forceDisableCopyOnlyInOrderSignaling);
}
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
return flushImmediate(ret, true, hasStallindCmds, relaxedOrderingDispatch, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -493,7 +509,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryFill(void
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -620,7 +636,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyRegion
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendImageCopyRegion(hDstImage, hSrcImage, pDstRegion, pSrcRegion, hSignalEvent,
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -641,7 +658,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyFromMe
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendImageCopyFromMemory(hDstImage, srcPtr, pDstRegion, hSignalEvent,
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -662,7 +679,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyToMemo
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendImageCopyToMemory(dstPtr, hSrcImage, pSrcRegion, hSignalEvent,
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -694,7 +711,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchCooperati
}
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchCooperativeKernel(kernelHandle, launchKernelArgs, hSignalEvent, numWaitEvents, waitEventHandles, relaxedOrderingDispatch);
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@ -6,6 +6,7 @@
*/
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/direct_submission/relaxed_ordering_helper.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
@ -1085,6 +1086,117 @@ HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingTh
driverHandle->releaseImportedPointer(dstPtr);
}
// Checks that, when relaxed ordering dispatch is not allowed for the CSR, every append call on an
// immediate command list ends with hasStallingCmds set in both the recorded dispatch flags and the
// latest flushed batch buffer. The whole sequence runs twice: first with an explicit wait event,
// then with in-order execution enabled and no wait events.
HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingDisabledRelaxedOrderingThenPassStallingCmdsInfo, IsAtLeastXeHpcCore) {
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
ze_result_t returnValue;
auto commandList = zeUniquePtr(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
ASSERT_NE(nullptr, commandList);
auto whiteBoxCmdList = static_cast<CommandList *>(commandList.get());
// Single host-visible timestamp event, used below as the wait event.
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
ze_event_handle_t event = nullptr;
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event));
// Take ownership of the created event so it is released when the test body exits.
std::unique_ptr<L0::Event> eventObject(L0::Event::fromHandle(event));
Mock<::L0::Kernel> kernel;
ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
// Host buffers shared by the copy, fill and image-to-memory calls below.
uint8_t srcPtr[64] = {};
uint8_t dstPtr[64] = {};
const ze_copy_region_t region = {0U, 0U, 0U, 1, 1, 0U};
// Imported here and released at the end of the test.
driverHandle->importExternalPointer(dstPtr, MemoryConstants::pageSize);
auto ultCsr = static_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(whiteBoxCmdList->csr);
// NOTE(review): presumably makes the mock CSR keep latestFlushedBatchBuffer for inspection - confirm in UltCommandStreamReceiver.
ultCsr->recordFlusheBatchBuffer = true;
// Precondition of the scenario: after unregistering the client the helper must report that
// relaxed ordering dispatch is not allowed.
ultCsr->unregisterClient();
EXPECT_FALSE(NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*ultCsr, 1));
// Expects the append call to succeed and both CSR-side records to carry hasStallingCmds.
auto verifyFlags = [&ultCsr](ze_result_t result) {
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_TRUE(ultCsr->recordedDispatchFlags.hasStallingCmds);
EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasStallingCmds);
};
// Clears both records so that each verifyFlags call observes only the append that follows the reset.
auto resetFlags = [&ultCsr]() {
ultCsr->recordedDispatchFlags.hasStallingCmds = false;
ultCsr->latestFlushedBatchBuffer.hasStallingCmds = false;
};
bool inOrderExecAlreadyEnabled = false;
for (bool inOrderExecution : {false, true}) {
// In-order execution is switched on once, at the start of the second iteration.
if (inOrderExecution && !inOrderExecAlreadyEnabled) {
whiteBoxCmdList->enableInOrderExecution();
inOrderExecAlreadyEnabled = true;
}
EXPECT_EQ(inOrderExecAlreadyEnabled, inOrderExecution);
// First pass: one explicit wait event. Second pass: no wait events at all.
uint32_t numWaitEvents = inOrderExecution ? 0 : 1;
ze_event_handle_t *waitlist = inOrderExecution ? nullptr : &event;
// non-pipelined state or first in-order exec
// This first append always passes the wait event, in both iterations.
resetFlags();
verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, 1, &event, launchParams, false));
// non-pipelined state already programmed
resetFlags();
verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, launchParams, false));
resetFlags();
verifyFlags(commandList->appendLaunchKernelIndirect(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, false));
resetFlags();
verifyFlags(commandList->appendMemoryCopy(dstPtr, srcPtr, 8, nullptr, numWaitEvents, waitlist, false, false));
resetFlags();
verifyFlags(commandList->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, nullptr, numWaitEvents, waitlist, false, false));
resetFlags();
verifyFlags(commandList->appendMemoryFill(dstPtr, srcPtr, 8, 1, nullptr, numWaitEvents, waitlist, false));
// Image copy paths are exercised only for families that report sampler support.
if constexpr (FamilyType::supportsSampler) {
// Note: this local `kernel` shadows the outer mock kernel for the rest of this scope.
auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::CopyImageRegion);
auto mockBuiltinKernel = static_cast<Mock<::L0::Kernel> *>(kernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
auto image = std::make_unique<WhiteBox<::L0::ImageCoreFamily<gfxCoreFamily>>>();
ze_image_region_t imgRegion = {1, 1, 1, 1, 1, 1};
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
image->initialize(device, &zeDesc);
resetFlags();
verifyFlags(commandList->appendImageCopyRegion(image->toHandle(), image->toHandle(), &imgRegion, &imgRegion, nullptr, numWaitEvents, waitlist, false));
resetFlags();
verifyFlags(commandList->appendImageCopyFromMemory(image->toHandle(), dstPtr, &imgRegion, nullptr, numWaitEvents, waitlist, false));
resetFlags();
verifyFlags(commandList->appendImageCopyToMemory(dstPtr, image->toHandle(), &imgRegion, nullptr, numWaitEvents, waitlist, false));
}
// Back to the outer mock kernel for the cooperative launch path.
resetFlags();
verifyFlags(commandList->appendLaunchCooperativeKernel(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, false));
}
driverHandle->releaseImportedPointer(dstPtr);
}
HWTEST2_F(CommandListCreate, whenDispatchingThenPassNumCsrClients, IsAtLeastXeHpcCore) {
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;