diff --git a/level_zero/core/source/cmdlist/cmdlist.cpp b/level_zero/core/source/cmdlist/cmdlist.cpp index cd5481e8e6..58cd49d748 100644 --- a/level_zero/core/source/cmdlist/cmdlist.cpp +++ b/level_zero/core/source/cmdlist/cmdlist.cpp @@ -171,15 +171,22 @@ void CommandList::migrateSharedAllocations() { } } -bool CommandList::setupTimestampEventForMultiTile(Event *signalEvent) { +bool CommandList::isTimestampEventForMultiTile(Event *signalEvent) { if (this->partitionCount > 1 && signalEvent) { if (signalEvent->isEventTimestampFlagSet()) { - signalEvent->setPacketsInUse(this->partitionCount); return true; } } return false; } +bool CommandList::setupTimestampEventForMultiTile(Event *signalEvent) { + if (isTimestampEventForMultiTile(signalEvent)) { + signalEvent->setPacketsInUse(this->partitionCount); + return true; + } + return false; +} + } // namespace L0 diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 473252b3dc..e5b6e8da95 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -308,6 +308,7 @@ struct CommandList : _ze_command_list_handle_t { NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize); NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed); bool setupTimestampEventForMultiTile(Event *signalEvent); + bool isTimestampEventForMultiTile(Event *signalEvent); bool getDcFlushRequired(bool externalCondition) const { return externalCondition ? dcFlushSupport : false; } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 66ac6bdc66..04058b0bc4 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -43,6 +43,17 @@ struct AlignedAllocationData { bool needsFlush = false; }; +struct CmdListFillKernelArguments { + size_t mainOffset = 0; + size_t mainGroupSize = 0; + size_t groups = 0; + size_t rightOffset = 0; + size_t patternOffsetRemainder = 0; + uint32_t leftRemainingBytes = 0; + uint32_t rightRemainingBytes = 0; + uint32_t patternSizeInEls = 0; +}; + struct EventPool; struct Event; @@ -157,7 +168,7 @@ struct CommandListCoreFamily : CommandListImp { uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; void appendMultiPartitionPrologue(uint32_t partitionDataSize) override; void appendMultiPartitionEpilogue() override; - void appendEventForProfilingAllWalkers(Event *event, bool beforeWalker); + void appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent); ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents); ze_result_t reserveSpace(size_t size, void **ptr) override; @@ -173,7 +184,8 @@ struct CommandListCoreFamily : CommandListImp { uint64_t srcOffset, uint64_t size, uint64_t elementSize, Builtin builtin, Event *signalEvent, - bool isStateless); + bool isStateless, + CmdListKernelLaunchParams &launchParams); MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyBlit(uintptr_t dstPtr, NEO::GraphicsAllocation *dstPtrAlloc, @@ -270,9 +282,15 @@ struct CommandListCoreFamily : CommandListImp { void handlePostSubmissionState(); virtual void createLogicalStateHelper(); + void setupFillKernelArguments(size_t baseOffset, + size_t patternSize, + size_t dstSize, + CmdListFillKernelArguments &outArguments, + Kernel *kernel); size_t cmdListCurrentStartOffset = 0; bool containsAnyKernel = false; + bool pipeControlMultiKernelEventSync = false; }; template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 715628e696..9fec05193f 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -136,6 +136,7 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO this->stateComputeModeTracking = L0HwHelper::enableStateComputeModeTracking(hwInfo); this->frontEndStateTracking = L0HwHelper::enableFrontEndStateTracking(hwInfo); this->pipelineSelectStateTracking = L0HwHelper::enablePipelineSelectStateTracking(hwInfo); + this->pipeControlMultiKernelEventSync = L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo); if (device->isImplicitScalingCapable() && !this->internalUsage && !isCopyOnly()) { this->partitionCount = static_cast(this->device->getNEODevice()->getDeviceBitfield().count()); @@ -926,7 +927,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyKernelWithGA(v uint64_t elementSize, Builtin builtin, Event *signalEvent, - bool isStateless) { + bool isStateless, + CmdListKernelLaunchParams &launchParams) { auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership(); @@ -957,8 +959,6 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyKernelWithGA(v ze_group_count_t dispatchKernelArgs{groups, 1u, 1u}; auto dstAllocationType = dstPtrAlloc->getAllocationType(); - CmdListKernelLaunchParams launchParams = {}; - launchParams.isKernelSplitOperation = true; launchParams.isBuiltInKernel = true; launchParams.isDestinationAllocationInSystemMemory = (dstAllocationType == NEO::AllocationType::BUFFER_HOST_MEMORY) || @@ -1088,6 +1088,8 @@ ze_result_t CommandListCoreFamily::appendPageFaultCopy(NEO::Graph srcAddress, srcAllocation, 0u, size); } else { + CmdListKernelLaunchParams launchParams = {}; + launchParams.isKernelSplitOperation = rightSize > 1; ret = appendMemoryCopyKernelWithGA(reinterpret_cast(&dstAddress), dstAllocation, 0, reinterpret_cast(&srcAddress), @@ -1096,7 +1098,8 @@ ze_result_t CommandListCoreFamily::appendPageFaultCopy(NEO::Graph middleElSize, Builtin::CopyBufferToBufferMiddle, nullptr, - isStateless); + isStateless, + launchParams); if (ret == ZE_RESULT_SUCCESS && rightSize) { ret = appendMemoryCopyKernelWithGA(reinterpret_cast(&dstAddress), dstAllocation, size - rightSize, @@ -1105,7 +1108,8 @@ ze_result_t CommandListCoreFamily::appendPageFaultCopy(NEO::Graph rightSize, 1UL, Builtin::CopyBufferToBufferSide, nullptr, - isStateless); + isStateless, + launchParams); } if (this->dcFlushSupport) { @@ -1183,7 +1187,16 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, signalEvent = Event::fromHandle(hSignalEvent); } - appendEventForProfilingAllWalkers(signalEvent, true); + uint32_t kernelCounter = leftSize > 0 ? 1 : 0; + kernelCounter += middleSizeBytes > 0 ? 1 : 0; + kernelCounter += rightSize > 0 ? 1 : 0; + + CmdListKernelLaunchParams launchParams = {}; + + launchParams.isKernelSplitOperation = kernelCounter > 1; + bool singlePipeControlPacket = this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation; + + appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket); if (ret == ZE_RESULT_SUCCESS && leftSize) { Builtin copyKernel = Builtin::CopyBufferToBufferSide; @@ -1203,7 +1216,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, leftSize, 1UL, copyKernel, signalEvent, - isStateless); + isStateless, + launchParams); } } @@ -1226,7 +1240,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, middleElSize, copyKernel, signalEvent, - isStateless); + isStateless, + launchParams); } } @@ -1248,11 +1263,12 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, rightSize, 1UL, copyKernel, signalEvent, - isStateless); + isStateless, + launchParams); } } - appendEventForProfilingAllWalkers(signalEvent, false); + appendEventForProfilingAllWalkers(signalEvent, false, singlePipeControlPacket); addFlushRequiredCommand(dstAllocationStruct.needsFlush, signalEvent); if (NEO::DebugManager.flags.EnableSWTags.get()) { @@ -1564,86 +1580,70 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, } auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership(); - CmdListKernelLaunchParams launchParams = {}; - launchParams.isKernelSplitOperation = true; - launchParams.isBuiltInKernel = true; - launchParams.isDestinationAllocationInSystemMemory = hostPointerNeedsFlush; + Kernel *builtinKernel = nullptr; if (patternSize == 1) { - size_t middleSize = size; - uint32_t leftRemainder = sizeof(uint32_t) - (dstAllocation.offset % sizeof(uint32_t)); - if (dstAllocation.offset % sizeof(uint32_t) != 0 && leftRemainder <= size) { - res = appendUnalignedFillKernel(isStateless, leftRemainder, dstAllocation, pattern, signalEvent, launchParams); - if (res) { - return res; - } - middleSize -= leftRemainder; - dstAllocation.offset += leftRemainder; - } - Kernel *builtinKernel = nullptr; - if (isStateless) { builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateStateless); } else { builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate); } - const auto dataTypeSize = sizeof(uint32_t) * 4; - size_t adjustedSize = middleSize / dataTypeSize; - size_t groupSizeX = device->getDeviceInfo().maxWorkGroupSize; - if (groupSizeX > adjustedSize && adjustedSize > 0) { - groupSizeX = adjustedSize; + } else { + if (isStateless) { + builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddleStateless); + } else { + builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle); } - if (builtinKernel->setGroupSize(static_cast(groupSizeX), 1u, 1u)) { + } + + CmdListKernelLaunchParams launchParams = {}; + launchParams.isBuiltInKernel = true; + launchParams.isDestinationAllocationInSystemMemory = hostPointerNeedsFlush; + + CmdListFillKernelArguments fillArguments = {}; + setupFillKernelArguments(dstAllocation.offset, patternSize, size, fillArguments, builtinKernel); + + launchParams.isKernelSplitOperation = (fillArguments.leftRemainingBytes > 0 || fillArguments.rightRemainingBytes > 0); + bool singlePipeControlPacket = this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation; + + appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket); + + if (patternSize == 1) { + if (fillArguments.leftRemainingBytes > 0) { + res = appendUnalignedFillKernel(isStateless, fillArguments.leftRemainingBytes, dstAllocation, pattern, signalEvent, launchParams); + if (res) { + return res; + } + } + + if (builtinKernel->setGroupSize(static_cast(fillArguments.mainGroupSize), 1u, 1u)) { DEBUG_BREAK_IF(true); return ZE_RESULT_ERROR_UNKNOWN; } - size_t groups = adjustedSize / groupSizeX; - uint32_t remainingBytes = static_cast((adjustedSize % groupSizeX) * dataTypeSize + - middleSize % dataTypeSize); - ze_group_count_t dispatchKernelArgs{static_cast(groups), 1u, 1u}; + ze_group_count_t dispatchKernelArgs{static_cast(fillArguments.groups), 1u, 1u}; uint32_t value = 0; memset(&value, *reinterpret_cast(pattern), 4); builtinKernel->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); - builtinKernel->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset); + builtinKernel->setArgumentValue(1, sizeof(fillArguments.mainOffset), &fillArguments.mainOffset); builtinKernel->setArgumentValue(2, sizeof(value), &value); - appendEventForProfilingAllWalkers(signalEvent, true); - res = appendLaunchKernelSplit(builtinKernel, &dispatchKernelArgs, signalEvent, launchParams); if (res) { return res; } - if (remainingBytes) { - dstAllocation.offset += (middleSize - remainingBytes); - res = appendUnalignedFillKernel(isStateless, remainingBytes, dstAllocation, pattern, signalEvent, launchParams); + if (fillArguments.rightRemainingBytes > 0) { + dstAllocation.offset = fillArguments.rightOffset; + res = appendUnalignedFillKernel(isStateless, fillArguments.rightRemainingBytes, dstAllocation, pattern, signalEvent, launchParams); if (res) { return res; } } } else { - - Kernel *builtinKernel = nullptr; - if (isStateless) { - builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddleStateless); - } else { - builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle); - } - size_t middleElSize = sizeof(uint32_t); - size_t adjustedSize = size / middleElSize; - uint32_t groupSizeX = static_cast(adjustedSize); - uint32_t groupSizeY = 1, groupSizeZ = 1; - builtinKernel->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ); - builtinKernel->setGroupSize(groupSizeX, groupSizeY, groupSizeZ); - - uint32_t groups = static_cast(adjustedSize) / groupSizeX; - uint32_t remainingBytes = static_cast((adjustedSize % groupSizeX) * middleElSize + - size % middleElSize); + builtinKernel->setGroupSize(static_cast(fillArguments.mainGroupSize), 1, 1); size_t patternAllocationSize = alignUp(patternSize, MemoryConstants::cacheLineSize); - uint32_t patternSizeInEls = static_cast(patternAllocationSize / middleElSize); - auto patternGfxAlloc = device->obtainReusableAllocation(patternAllocationSize, NEO::AllocationType::FILL_PATTERN); if (patternGfxAlloc == nullptr) { patternGfxAlloc = device->getDriverHandle()->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getNEODevice()->getRootDeviceIndex(), @@ -1666,22 +1666,21 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, patternAllocOffset += patternSizeToCopy; } while (patternAllocOffset < patternAllocationSize); + builtinKernel->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); builtinKernel->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset); builtinKernel->setArgBufferWithAlloc(2, reinterpret_cast(patternGfxAllocPtr), patternGfxAlloc); - builtinKernel->setArgumentValue(3, sizeof(patternSizeInEls), &patternSizeInEls); + builtinKernel->setArgumentValue(3, sizeof(fillArguments.patternSizeInEls), &fillArguments.patternSizeInEls); - appendEventForProfilingAllWalkers(signalEvent, true); - - ze_group_count_t dispatchKernelArgs{groups, 1u, 1u}; + ze_group_count_t dispatchKernelArgs{static_cast(fillArguments.groups), 1u, 1u}; res = appendLaunchKernelSplit(builtinKernel, &dispatchKernelArgs, signalEvent, launchParams); if (res) { return res; } - if (remainingBytes) { - uint32_t dstOffsetRemainder = groups * groupSizeX * static_cast(middleElSize); - uint64_t patternOffsetRemainder = (groupSizeX * groups & (patternSizeInEls - 1)) * middleElSize; + if (fillArguments.rightRemainingBytes > 0) { + uint32_t dstOffsetRemainder = static_cast(fillArguments.rightOffset); + uint64_t patternOffsetRemainder = fillArguments.patternOffsetRemainder; Kernel *builtinKernelRemainder; if (isStateless) { @@ -1690,7 +1689,7 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, builtinKernelRemainder = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferRightLeftover); } - builtinKernelRemainder->setGroupSize(remainingBytes, 1u, 1u); + builtinKernelRemainder->setGroupSize(fillArguments.rightRemainingBytes, 1u, 1u); ze_group_count_t dispatchKernelArgs{1u, 1u, 1u}; builtinKernelRemainder->setArgBufferWithAlloc(0, @@ -1711,7 +1710,7 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, } } - appendEventForProfilingAllWalkers(signalEvent, false); + appendEventForProfilingAllWalkers(signalEvent, false, singlePipeControlPacket); addFlushRequiredCommand(hostPointerNeedsFlush, signalEvent); if (NEO::DebugManager.flags.EnableSWTags.get()) { @@ -2544,4 +2543,57 @@ void CommandListCoreFamily::addFlushRequiredCommand(bool flushOpe } } +template +void CommandListCoreFamily::setupFillKernelArguments(size_t baseOffset, + size_t patternSize, + size_t dstSize, + CmdListFillKernelArguments &outArguments, + Kernel *kernel) { + if (patternSize == 1) { + size_t middleSize = dstSize; + outArguments.mainOffset = baseOffset; + outArguments.leftRemainingBytes = sizeof(uint32_t) - (baseOffset % sizeof(uint32_t)); + if (baseOffset % sizeof(uint32_t) != 0 && outArguments.leftRemainingBytes <= dstSize) { + middleSize -= outArguments.leftRemainingBytes; + outArguments.mainOffset += outArguments.leftRemainingBytes; + } else { + outArguments.leftRemainingBytes = 0; + } + + const auto dataTypeSize = sizeof(uint32_t) * 4; + size_t adjustedSize = middleSize / dataTypeSize; + outArguments.mainGroupSize = this->device->getDeviceInfo().maxWorkGroupSize; + if (outArguments.mainGroupSize > adjustedSize && adjustedSize > 0) { + outArguments.mainGroupSize = adjustedSize; + } + + outArguments.groups = adjustedSize / outArguments.mainGroupSize; + outArguments.rightRemainingBytes = static_cast((adjustedSize % outArguments.mainGroupSize) * dataTypeSize + + middleSize % dataTypeSize); + + if (outArguments.rightRemainingBytes > 0) { + outArguments.rightOffset = outArguments.mainOffset + (middleSize - outArguments.rightRemainingBytes); + } + } else { + size_t middleElSize = sizeof(uint32_t); + size_t adjustedSize = dstSize / middleElSize; + uint32_t groupSizeX = static_cast(adjustedSize); + uint32_t groupSizeY = 1, groupSizeZ = 1; + kernel->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ); + outArguments.mainGroupSize = groupSizeX; + + outArguments.groups = static_cast(adjustedSize) / outArguments.mainGroupSize; + outArguments.rightRemainingBytes = static_cast((adjustedSize % outArguments.mainGroupSize) * middleElSize + + dstSize % middleElSize); + + size_t patternAllocationSize = alignUp(patternSize, MemoryConstants::cacheLineSize); + outArguments.patternSizeInEls = static_cast(patternAllocationSize / middleElSize); + + if (outArguments.rightRemainingBytes > 0) { + outArguments.rightOffset = outArguments.groups * outArguments.mainGroupSize * middleElSize; + outArguments.patternOffsetRemainder = (outArguments.mainGroupSize * outArguments.groups & (outArguments.patternSizeInEls - 1)) * middleElSize; + } + } +} + } // namespace L0 diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index 485105f17a..1861974945 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -246,7 +246,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelSplit(Kernel } template -void CommandListCoreFamily::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker) { +void CommandListCoreFamily::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent) { if (beforeWalker) { appendEventForProfiling(event, true, false); } else { diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index c12724b826..b5bdc4b529 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -404,25 +404,31 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelSplit(Kernel Event *event, const CmdListKernelLaunchParams &launchParams) { if (event) { - event->increaseKernelCount(); + if (this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation) { + event = nullptr; + } else { + event->increaseKernelCount(); + } } return appendLaunchKernelWithParams(kernel, threadGroupDimensions, event, launchParams); } template -void CommandListCoreFamily::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker) { - if (isCopyOnly()) { +void CommandListCoreFamily::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent) { + if (isCopyOnly() || singlePacketEvent) { if (beforeWalker) { - appendEventForProfiling(event, true, false); + bool workloadPartition = setupTimestampEventForMultiTile(event); + appendEventForProfiling(event, true, workloadPartition); } else { - appendSignalEventPostWalker(event, false); + bool workloadPartition = isTimestampEventForMultiTile(event); + appendSignalEventPostWalker(event, workloadPartition); } } else { if (event) { if (beforeWalker) { event->zeroKernelCount(); } else { - if (getDcFlushRequired(!!event->signalScope)) { + if (event->getKernelCount() > 1 && getDcFlushRequired(!!event->signalScope)) { programEventL3Flush(event, this->device, this->partitionCount, this->commandContainer); } } diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h index 76df903ed3..ee99f29d96 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -62,7 +62,7 @@ struct BcsSplit { ze_result_t result = ZE_RESULT_SUCCESS; if (hSignalEvent) { - cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true); + cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true, true); } auto markerEventIndex = this->events.obtainForSplit(Context::fromHandle(cmdList->hContext), MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate::GfxFamily::TimestampPacketType)); @@ -86,10 +86,10 @@ struct BcsSplit { } cmdList->addEventsToCmdList(static_cast(this->cmdQs.size()), eventHandles.data()); - cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], false); + cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], false, true); if (hSignalEvent) { - cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), false); + cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), false, true); } return result; diff --git a/level_zero/core/source/hw_helpers/l0_hw_helper.cpp b/level_zero/core/source/hw_helpers/l0_hw_helper.cpp index fa0ed2fef3..ec2ee273e6 100644 --- a/level_zero/core/source/hw_helpers/l0_hw_helper.cpp +++ b/level_zero/core/source/hw_helpers/l0_hw_helper.cpp @@ -46,4 +46,11 @@ bool L0HwHelper::enableImmediateCmdListHeapSharing(const NEO::HardwareInfo &hwIn return platformSupport && cmdlistSupport; } +bool L0HwHelper::usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo) { + if (NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.get() != -1) { + return !!NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.get(); + } + return false; +} + } // namespace L0 diff --git a/level_zero/core/source/hw_helpers/l0_hw_helper.h b/level_zero/core/source/hw_helpers/l0_hw_helper.h index 58468987f4..b512da30b6 100644 --- a/level_zero/core/source/hw_helpers/l0_hw_helper.h +++ b/level_zero/core/source/hw_helpers/l0_hw_helper.h @@ -34,6 +34,7 @@ class L0HwHelper { static bool enablePipelineSelectStateTracking(const NEO::HardwareInfo &hwInfo); static bool enableStateComputeModeTracking(const NEO::HardwareInfo &hwInfo); static bool enableImmediateCmdListHeapSharing(const NEO::HardwareInfo &hwInfo, bool cmdlistSupport); + static bool usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo); virtual void setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const = 0; virtual L0::Event *createEvent(L0::EventPool *eventPool, const ze_event_desc_t *desc, L0::Device *device) const = 0; diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index e66ad25d3e..629e13a016 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -65,6 +65,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::initialize; using BaseClass::partitionCount; using BaseClass::patternAllocations; + using BaseClass::pipeControlMultiKernelEventSync; using BaseClass::pipelineSelectStateTracking; using BaseClass::requiredStreamState; using BaseClass::stateComputeModeTracking; @@ -130,6 +131,7 @@ struct WhiteBox> using BaseClass::immediateCmdListHeapSharing; using BaseClass::isFlushTaskSubmissionEnabled; using BaseClass::partitionCount; + using BaseClass::pipeControlMultiKernelEventSync; using BaseClass::pipelineSelectStateTracking; using BaseClass::requiredStreamState; using BaseClass::stateComputeModeTracking; @@ -143,6 +145,7 @@ struct MockCommandListImmediate : public CommandListCoreFamilyImmediate { uint64_t srcOffset, uint64_t size, uint64_t elementSize, Builtin builtin, Event *signalEvent, - bool isStateless), - (dstPtr, dstPtrAlloc, dstOffset, srcPtr, srcPtrAlloc, srcOffset, size, elementSize, builtin, signalEvent, isStateless)); + bool isStateless, + CmdListKernelLaunchParams &launchParams), + (dstPtr, dstPtrAlloc, dstOffset, srcPtr, srcPtrAlloc, srcOffset, size, elementSize, builtin, signalEvent, isStateless, launchParams)); ADDMETHOD_NOBASE(appendMemoryCopyBlit, ze_result_t, ZE_RESULT_SUCCESS, (uintptr_t dstPtr, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index 5b3ab36578..614107da46 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -50,7 +50,8 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamilyappendMemoryCopy(dstPtr, srcPtr, 8, nullptr, 0, nullptr); EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); - EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation); EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); } @@ -593,7 +593,7 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyInUsmHostAllocat commandList->appendMemoryCopy(dstBuffer, srcPtr, 8, nullptr, 0, nullptr); EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); - EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation); EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); context->freeMem(dstBuffer); @@ -617,7 +617,7 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyInUsmDeviceAlloc commandList->appendMemoryCopy(dstBuffer, srcPtr, 8, nullptr, 0, nullptr); EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); - EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation); EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); context->freeMem(dstBuffer); @@ -638,7 +638,12 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillInUsmHostThenBui commandList->appendMemoryFill(dstBuffer, pattern, patternSize, allocSize, nullptr, 0, nullptr); EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); - EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation); + EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); + + commandList->appendMemoryFill(dstBuffer, pattern, 1, allocSize, nullptr, 0, nullptr); + EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation); EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); context->freeMem(dstBuffer); @@ -663,6 +668,43 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillInUsmDeviceThenB commandList->appendMemoryFill(dstBuffer, pattern, patternSize, size, nullptr, 0, nullptr); EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); + + commandList->appendMemoryFill(dstBuffer, pattern, 1, size, nullptr, 0, nullptr); + EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); + + context->freeMem(dstBuffer); +} + +HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillRequiresMultiKernelsThenSplitFlagIsSet, IsAtLeastSkl) { + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + + constexpr size_t patternSize = 8; + uint8_t pattern[patternSize] = {1, 2, 3, 4}; + + constexpr size_t size = 4096u; + constexpr size_t alignment = 4096u; + void *dstBuffer = nullptr; + + ze_device_mem_alloc_desc_t deviceDesc = {}; + auto result = context->allocDeviceMem(device->toHandle(), + &deviceDesc, + size, alignment, &dstBuffer); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + constexpr size_t fillSize = size - 1; + + commandList->appendMemoryFill(dstBuffer, pattern, patternSize, fillSize, nullptr, 0, nullptr); + EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); + EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); + + commandList->appendMemoryFill(dstBuffer, pattern, 1, fillSize, nullptr, 0, nullptr); + EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation); EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp index d5b11a0124..739678b63e 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp @@ -89,6 +89,8 @@ class AppendFillFixture : public DeviceFixture { delete[] dstPtr; } + DebugManagerStateRestore restorer; + std::unique_ptr> driverHandle; NEO::MockDevice *neoDevice = nullptr; L0::Device *device = nullptr; @@ -108,8 +110,6 @@ struct MultiTileAppendFillFixture : public AppendFillFixture { DebugManager.flags.EnableImplicitScaling.set(1); AppendFillFixture::setUp(); } - - DebugManagerStateRestore restorer; }; using AppendFillTest = Test; @@ -480,203 +480,612 @@ HWTEST2_F(AppendFillTest, false); } -HWTEST2_F(AppendFillTest, - givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, IsAtLeastXeHpCore) { - using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER; - using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; +template +struct AppendFillMultiPacketEventFixture : public AppendFillFixture { + void setUp() { + DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); + AppendFillFixture::setUp(); + } - ze_event_pool_desc_t eventPoolDesc = {}; - eventPoolDesc.count = 1; - eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + template + void testAppendMemoryFillManyImmediateKernels() { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; - ze_event_desc_t eventDesc = {}; - eventDesc.index = 0; + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; - ze_result_t result = ZE_RESULT_SUCCESS; - auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; - uint64_t firstKernelEventAddress = event->getGpuAddress(device); - uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize(); + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); - auto commandList = std::make_unique>>(); - commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - auto &commandContainer = commandList->commandContainer; + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + event->getSinglePacketSize(); - size_t usedBefore = commandContainer.getCommandStream()->getUsed(); - result = commandList->appendMemoryFill(immediateDstPtr, &immediatePattern, - sizeof(immediatePattern), - immediateAllocSize, event->toHandle(), 0, nullptr); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + auto &commandContainer = commandList->commandContainer; - EXPECT_EQ(2u, event->getPacketsInUse()); - EXPECT_EQ(2u, event->getKernelCount()); + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + result = commandList->appendMemoryFill(immediateDstPtr, &immediatePattern, + sizeof(immediatePattern), + immediateAllocSize, event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, - ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), - usedAfter - usedBefore)); + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); - auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); - ASSERT_EQ(2u, itorWalkers.size()); - auto firstWalker = itorWalkers[0]; - auto secondWalker = itorWalkers[1]; + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); - auto walkerCmd = genCmdCast(*firstWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(2u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + auto secondWalker = itorWalkers[1]; - walkerCmd = genCmdCast(*secondWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*secondWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + } + + template + void testAppendMemoryFillManyKernels() { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + event->getSinglePacketSize(); + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + auto &commandContainer = commandList->commandContainer; + + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(2u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + auto secondWalker = itorWalkers[1]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*secondWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + } + + template + void testAppendMemoryFillSingleKernel() { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + + int pattern = 0; + const size_t size = 1024; + uint8_t array[size] = {}; + + auto &commandContainer = commandList->commandContainer; + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + result = commandList->appendMemoryFill(array, &pattern, 1, size, event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + } + + template + void testAppendMemoryFillSingleKernelAndL3Flush() { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + + int pattern = 0; + const size_t size = 1024; + uint8_t array[size] = {}; + + auto &commandContainer = commandList->commandContainer; + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + result = commandList->appendMemoryFill(array, &pattern, 1, size, event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + event->getSinglePacketSize(); + if (event->isUsingContextEndOffset()) { + l3FlushPostSyncAddress += event->getContextEndOffset(); + } + + auto itorPipeControls = findAll(firstWalker, cmdList.end()); + + uint32_t postSyncPipeControls = 0; + uint32_t dcFlushFound = 0; + for (auto it : itorPipeControls) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + } + if (cmd->getDcFlushEnable()) { + dcFlushFound++; + } + } + EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls); + EXPECT_EQ(1u, dcFlushFound); + } + + uint32_t expectedPacketsInUse = 0; + uint32_t expectedKernelCount = 0; + uint32_t expectedWalkerPostSyncOp = 0; + uint32_t expectedPostSyncPipeControls = 0; + bool postSyncAddressZero = false; +}; + +using AppendFillMultiPacketEventTest = Test>; +using AppendFillSinglePacketEventTest = Test>; + +HWTEST2_F(AppendFillMultiPacketEventTest, + givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 2; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; + + testAppendMemoryFillManyImmediateKernels(); } -HWTEST2_F(AppendFillTest, - givenCallToAppendMemoryFillWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, IsAtLeastXeHpCore) { - using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER; - using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; +HWTEST2_F(AppendFillMultiPacketEventTest, + givenCallToAppendMemoryFillWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncProfiling, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 2; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; - ze_event_pool_desc_t eventPoolDesc = {}; - eventPoolDesc.count = 1; - eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; - - ze_event_desc_t eventDesc = {}; - eventDesc.index = 0; - - ze_result_t result = ZE_RESULT_SUCCESS; - auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); - - uint64_t firstKernelEventAddress = event->getGpuAddress(device); - uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize(); - - auto commandList = std::make_unique>>(); - commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - auto &commandContainer = commandList->commandContainer; - - size_t usedBefore = commandContainer.getCommandStream()->getUsed(); - result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - size_t usedAfter = commandContainer.getCommandStream()->getUsed(); - - EXPECT_EQ(2u, event->getPacketsInUse()); - EXPECT_EQ(2u, event->getKernelCount()); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, - ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), - usedAfter - usedBefore)); - - auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); - ASSERT_EQ(2u, itorWalkers.size()); - auto firstWalker = itorWalkers[0]; - auto secondWalker = itorWalkers[1]; - - auto walkerCmd = genCmdCast(*firstWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*secondWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + testAppendMemoryFillManyKernels(); } -HWTEST2_F(MultiTileAppendFillTest, - givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfilingAndSingleDcFlushWithPostSync, IsAtLeastXeHpCore) { - using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER; - using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; - using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; +HWTEST2_F(AppendFillMultiPacketEventTest, + givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSync, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 1; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; - ze_event_pool_desc_t eventPoolDesc = {}; - eventPoolDesc.count = 1; - eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + testAppendMemoryFillSingleKernel(); +} - ze_event_desc_t eventDesc = {}; - eventDesc.index = 0; - eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; +HWTEST2_F(AppendFillMultiPacketEventTest, + givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSyncAndL3PostSync, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; - ze_result_t result = ZE_RESULT_SUCCESS; - auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + testAppendMemoryFillSingleKernelAndL3Flush(); +} - uint64_t firstKernelEventAddress = event->getGpuAddress(device); - uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); +HWTEST2_F(AppendFillSinglePacketEventTest, + givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfiling, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 1; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 0; + postSyncAddressZero = true; - auto commandList = std::make_unique>>(); - commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - EXPECT_EQ(2u, commandList->partitionCount); - auto &commandContainer = commandList->commandContainer; + testAppendMemoryFillManyImmediateKernels(); +} - size_t usedBefore = commandContainer.getCommandStream()->getUsed(); - result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - size_t usedAfter = commandContainer.getCommandStream()->getUsed(); +HWTEST2_F(AppendFillSinglePacketEventTest, + givenCallToAppendMemoryFillWhenTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfiling, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 1; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 0; + postSyncAddressZero = true; - // two kernels and each kernel uses two packets (for two tiles), in total 4 - uint32_t expectedPacketsInUse = 4; + testAppendMemoryFillManyKernels(); +} - uint32_t expectedDcFlush = 0; +HWTEST2_F(AppendFillSinglePacketEventTest, + givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSync, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 1; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; + + testAppendMemoryFillSingleKernel(); +} + +HWTEST2_F(AppendFillSinglePacketEventTest, + givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSyncAndL3PostSync, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryFillSingleKernelAndL3Flush(); +} + +template +struct MultiTileAppendFillMultiPacketEventFixture : public MultiTileAppendFillFixture { + void setUp() { + DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); + MultiTileAppendFillFixture::setUp(); + } + + template + void testAppendMemoryFillManyKernels(ze_event_pool_flags_t eventPoolFlags) { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = eventPoolFlags; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + EXPECT_EQ(2u, commandList->partitionCount); + auto &commandContainer = commandList->commandContainer; + + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + + uint32_t expectedDcFlush = 0; + + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getHwInfo())) { + // 1st dc flush after cross-tile sync, 2nd dc flush for signal scope event + expectedDcFlush = 2; + } + + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(2u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + auto secondWalker = itorWalkers[1]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*secondWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + auto itorPipeControls = findAll(secondWalker, cmdList.end()); + + uint32_t postSyncPipeControls = 0; + uint32_t dcFlushFound = 0; + + for (auto it : itorPipeControls) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + postSyncPipeControls++; + } + if (cmd->getDcFlushEnable()) { + dcFlushFound++; + } + } + + EXPECT_EQ(expectedPostSyncPipeControl, postSyncPipeControls); + EXPECT_EQ(expectedDcFlush, dcFlushFound); + } + + template + void testAppendMemoryFillSingleKernelAndL3Flush(ze_event_pool_flags_t eventPoolFlags) { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = eventPoolFlags; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + + int pattern = 0; + const size_t size = 1024; + uint8_t array[size] = {}; + + auto &commandContainer = commandList->commandContainer; + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + result = commandList->appendMemoryFill(array, &pattern, 1, size, event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + 2 * event->getSinglePacketSize(); + if (event->isUsingContextEndOffset()) { + l3FlushPostSyncAddress += event->getContextEndOffset(); + } + + auto itorPipeControls = findAll(firstWalker, cmdList.end()); + + uint32_t postSyncPipeControls = 0; + uint32_t dcFlushFound = 0; + for (auto it : itorPipeControls) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + } + if (cmd->getDcFlushEnable()) { + dcFlushFound++; + } + } + + constexpr uint32_t expectedDcFlush = 2; // dc flush for last cross-tile sync and separately for signal scope event after last kernel split + EXPECT_EQ(expectedPostSyncPipeControl, postSyncPipeControls); + EXPECT_EQ(expectedDcFlush, dcFlushFound); + } + + uint32_t expectedPacketsInUse = 0; + uint32_t expectedKernelCount = 0; + uint32_t expectedWalkerPostSyncOp = 0; uint32_t expectedPostSyncPipeControl = 0; + bool postSyncAddressZero = false; +}; - if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getHwInfo())) { - //laster kernel uses 4 packets, in addition to kernel two packets, use 2 packets to two tile cache flush +using MultiTileAppendFillEventMultiPacketTest = Test>; +using MultiTileAppendFillEventSinglePacketTest = Test>; + +HWTEST2_F(MultiTileAppendFillEventMultiPacketTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncProfilingAndSingleDcFlushWithImmediatePostSync, IsAtLeastXeHpCore) { + // two kernels and each kernel uses two packets (for two tiles), in total 4 + expectedPacketsInUse = 4; + expectedKernelCount = 2; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControl = 0; + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getHwInfo())) { + // last kernel uses 4 packets, in addition to kernel two packets, use 2 packets to two tile cache flush expectedPacketsInUse = 6; - // 1st dc flush after cross-tile sync, 2nd dc flush for signal scope event - expectedDcFlush = 2; - //cache flush with event signal + // cache flush with event signal expectedPostSyncPipeControl = 1; } + postSyncAddressZero = false; + testAppendMemoryFillManyKernels(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); +} - EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); - EXPECT_EQ(2u, event->getKernelCount()); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, - ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), - usedAfter - usedBefore)); - - auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); - ASSERT_EQ(2u, itorWalkers.size()); - auto firstWalker = itorWalkers[0]; - auto secondWalker = itorWalkers[1]; - - auto walkerCmd = genCmdCast(*firstWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*secondWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - auto itorPipeControls = findAll(secondWalker, cmdList.end()); - - uint32_t postSyncPipeControls = 0; - uint32_t dcFlushFound = 0; - - for (auto it : itorPipeControls) { - auto cmd = genCmdCast(*it); - if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { - EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); - postSyncPipeControls++; - } - if (cmd->getDcFlushEnable()) { - dcFlushFound++; - } +HWTEST2_F(MultiTileAppendFillEventMultiPacketTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeImmediateEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncAndSingleDcFlushWithPostSync, IsAtLeastXeHpCore) { + // two kernels and each kernel uses two packets (for two tiles), in total 4 + expectedPacketsInUse = 4; + expectedKernelCount = 2; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControl = 0; + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getHwInfo())) { + // last kernel uses 4 packets, in addition to kernel two packets, use 2 packets to two tile cache flush + expectedPacketsInUse = 6; + // cache flush with event signal + expectedPostSyncPipeControl = 1; } + postSyncAddressZero = false; + testAppendMemoryFillManyKernels(0); +} - EXPECT_EQ(expectedPostSyncPipeControl, postSyncPipeControls); - EXPECT_EQ(expectedDcFlush, dcFlushFound); +HWTEST2_F(MultiTileAppendFillEventMultiPacketTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSingleKernelsUsesWalkerPostSyncProfilingAndSingleDcFlushWithImmediatePostSync, IsXeHpOrXeHpgCore) { + // kernel uses 4 packets, in addition to kernel two packets, use 2 packets to two tile cache flush + expectedPacketsInUse = 4; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + // cache flush with event signal + expectedPostSyncPipeControl = 1; + postSyncAddressZero = false; + + testAppendMemoryFillSingleKernelAndL3Flush(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); +} + +HWTEST2_F(MultiTileAppendFillEventMultiPacketTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeImmediateEventUsesComputeWalkerPostSyncThenSingleKernelUsesWalkerPostSyncAndSingleDcFlushWithPostSync, IsXeHpOrXeHpgCore) { + // kernel uses 4 packets, in addition to kernel two packets, use 2 packets to two tile cache flush + expectedPacketsInUse = 4; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + // cache flush with event signal + expectedPostSyncPipeControl = 1; + + testAppendMemoryFillSingleKernelAndL3Flush(0); +} + +HWTEST2_F(MultiTileAppendFillEventSinglePacketTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfilingAndDcFlushWithNoPostSync, IsAtLeastXeHpCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 0; + expectedPostSyncPipeControl = 0; + postSyncAddressZero = true; + testAppendMemoryFillManyKernels(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); +} + +HWTEST2_F(MultiTileAppendFillEventSinglePacketTest, + givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeImmediateEventUsesPipeControlPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfilingAndDcFlushWithImmediatePostSync, IsAtLeastXeHpCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedPacketsInUse = 2; + expectedWalkerPostSyncOp = 0; + expectedPostSyncPipeControl = 1; + postSyncAddressZero = true; + testAppendMemoryFillManyKernels(0); } } // namespace ult diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp index cb75154b27..d98c5c368e 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp @@ -13,6 +13,7 @@ #include "shared/test/common/test_macros/hw_test.h" #include "level_zero/core/source/cmdlist/cmdlist_hw.h" +#include "level_zero/core/source/hw_helpers/l0_hw_helper.h" #include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" #include "level_zero/core/test/unit_tests/fixtures/module_fixture.h" @@ -295,368 +296,956 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenVariousKernelsAndPatchingDisallowe pCommandList->reset(); } -using AppendMemoryCopyXeHpAndLater = Test; -using MultiTileAppendMemoryCopyXeHpAndLater = Test; +template +struct AppendMemoryCopyXeHpAndLaterFixture : public DeviceFixture { + void setUp() { + DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); + DeviceFixture::setUp(); + } -HWTEST2_F(AppendMemoryCopyXeHpAndLater, + template + void testAppendMemoryCopyThreeKernels() { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + MockAppendMemoryCopy commandList; + commandList.appendMemoryCopyKernelWithGACallBase = true; + + commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + void *srcPtr = reinterpret_cast(0x1231); + void *dstPtr = reinterpret_cast(0x200002345); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + event->getSinglePacketSize(); + uint64_t thirdKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); + + commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr); + EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled); + EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), + commandList.commandContainer.getCommandStream()->getUsed())); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(3u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + auto secondWalker = itorWalkers[1]; + auto thirdWalker = itorWalkers[2]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*secondWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*thirdWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + } + + template + void testAppendMemoryCopyThreeKernelsAndL3Flush(ze_event_pool_flags_t eventPoolFlags) { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + MockAppendMemoryCopy commandList; + commandList.appendMemoryCopyKernelWithGACallBase = true; + + commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + void *srcPtr = reinterpret_cast(0x1231); + void *dstPtr = reinterpret_cast(0x200002345); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = eventPoolFlags; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + event->getSinglePacketSize(); + uint64_t thirdKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); + + commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr); + EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled); + EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), + commandList.commandContainer.getCommandStream()->getUsed())); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(3u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + auto secondWalker = itorWalkers[1]; + auto thirdWalker = itorWalkers[2]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*secondWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*thirdWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + uint64_t l3FlushPostSyncAddress = thirdKernelEventAddress + event->getSinglePacketSize(); + if (usePipeControlMultiPacketEventSync == 1) { + l3FlushPostSyncAddress = event->getGpuAddress(device); + } + if (event->isUsingContextEndOffset()) { + l3FlushPostSyncAddress += event->getContextEndOffset(); + } + + auto itorPipeControls = findAll(firstWalker, cmdList.end()); + + uint32_t postSyncPipeControls = 0; + uint32_t dcFlushFound = 0; + for (auto it : itorPipeControls) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + } + if (cmd->getDcFlushEnable()) { + dcFlushFound++; + } + } + EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls); + EXPECT_EQ(1u, dcFlushFound); + } + + template + void testAppendMemoryCopySingleKernel() { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + MockAppendMemoryCopy commandList; + commandList.appendMemoryCopyKernelWithGACallBase = true; + + commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + void *srcPtr = reinterpret_cast(0x1000); + void *dstPtr = reinterpret_cast(0x20000000); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + + commandList.appendMemoryCopy(dstPtr, srcPtr, 0x10000000, event->toHandle(), 0, nullptr); + EXPECT_EQ(1u, commandList.appendMemoryCopyKernelWithGACalled); + EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), + commandList.commandContainer.getCommandStream()->getUsed())); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + } + + template + void testAppendMemoryCopySingleKernelAndL3Flush(ze_event_pool_flags_t eventPoolFlags) { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + MockAppendMemoryCopy commandList; + commandList.appendMemoryCopyKernelWithGACallBase = true; + + commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + void *srcPtr = reinterpret_cast(0x1000); + void *dstPtr = reinterpret_cast(0x200000000); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = eventPoolFlags; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + + commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100000000, event->toHandle(), 0, nullptr); + EXPECT_EQ(1u, commandList.appendMemoryCopyKernelWithGACalled); + EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), + commandList.commandContainer.getCommandStream()->getUsed())); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + event->getSinglePacketSize(); + if (event->isUsingContextEndOffset()) { + l3FlushPostSyncAddress += event->getContextEndOffset(); + } + + auto itorPipeControls = findAll(firstWalker, cmdList.end()); + + uint32_t postSyncPipeControls = 0; + uint32_t dcFlushFound = 0; + for (auto it : itorPipeControls) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + } + if (cmd->getDcFlushEnable()) { + dcFlushFound++; + } + } + EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls); + EXPECT_EQ(1u, dcFlushFound); + } + + template + void testAppendMemoryCopySignalScopeEventToSubDevice() { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + + ze_result_t result = ZE_RESULT_SUCCESS; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result)); + auto &commandContainer = commandList->commandContainer; + + void *srcPtr = reinterpret_cast(0x1234); + void *dstPtr = reinterpret_cast(0x2345); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_SUBDEVICE; + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + result = commandList->appendMemoryCopy(dstPtr, srcPtr, 0x1001, event.get(), 0u, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto itorWalker = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker); + + auto pipeControls = findAll(itorWalker, cmdList.end()); + uint32_t postSyncFound = 0; + uint32_t dcFlushFound = 0; + ASSERT_NE(0u, pipeControls.size()); + for (auto it : pipeControls) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA && + cmd->getImmediateData() == Event::STATE_SIGNALED) { + postSyncFound++; + } + if (cmd->getDcFlushEnable()) { + dcFlushFound++; + } + } + + constexpr uint32_t expectedDcFlushFound = 1u; + + EXPECT_EQ(1u, postSyncFound); + EXPECT_EQ(expectedDcFlushFound, dcFlushFound); + } + + DebugManagerStateRestore restorer; + uint32_t expectedPacketsInUse = 0; + uint32_t expectedKernelCount = 0; + uint32_t expectedWalkerPostSyncOp = 0; + uint32_t expectedPostSyncPipeControls = 0; + bool postSyncAddressZero = false; +}; + +using AppendMemoryCopyXeHpAndLaterMultiPacket = Test>; + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterMultiPacket, givenCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernels, IsAtLeastXeHpCore) { - using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER; - using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + expectedPacketsInUse = 3; + expectedKernelCount = 3; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; - MockAppendMemoryCopy commandList; - commandList.appendMemoryCopyKernelWithGACallBase = true; - - commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - void *srcPtr = reinterpret_cast(0x1231); - void *dstPtr = reinterpret_cast(0x200002345); - - ze_event_pool_desc_t eventPoolDesc = {}; - eventPoolDesc.count = 1; - eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; - - ze_event_desc_t eventDesc = {}; - eventDesc.index = 0; - - ze_result_t result = ZE_RESULT_SUCCESS; - auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); - - uint64_t firstKernelEventAddress = event->getGpuAddress(device); - uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize(); - uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); - - commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr); - EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled); - EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); - EXPECT_EQ(3u, event->getPacketsInUse()); - EXPECT_EQ(3u, event->getKernelCount()); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), - commandList.commandContainer.getCommandStream()->getUsed())); - - auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); - ASSERT_EQ(3u, itorWalkers.size()); - auto firstWalker = itorWalkers[0]; - auto secondWalker = itorWalkers[1]; - auto thirdWalker = itorWalkers[2]; - - auto walkerCmd = genCmdCast(*firstWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*secondWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*thirdWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + testAppendMemoryCopyThreeKernels(); } -HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLater, +HWTEST2_F(AppendMemoryCopyXeHpAndLaterMultiPacket, + givenCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleKernel, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 1; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernel(); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterMultiPacket, + givenCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernelsAndL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 4; + expectedKernelCount = 3; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryCopyThreeKernelsAndL3Flush(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterMultiPacket, + givenCommandListAndEventWithSignalScopeWhenImmediateProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernelsAndL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 4; + expectedKernelCount = 3; + expectedWalkerPostSyncOp = L0HwHelper::get(gfxCoreFamily).multiTileCapablePlatform() ? 3 : 1; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryCopyThreeKernelsAndL3Flush(0); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterMultiPacket, + givenCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleSeparateKernelAndL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernelAndL3Flush(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterMultiPacket, + givenCommandListAndEventWithSignalScopeWhenImmediateProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleSeparateKernelAndL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = L0HwHelper::get(gfxCoreFamily).multiTileCapablePlatform() ? 3 : 1; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernelAndL3Flush(0); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterMultiPacket, + givenCommandListWhenMemoryCopyWithSignalEventScopeSetToSubDeviceThenB2BPipeControlIsAddedWithDcFlushWithPostSyncForLastPC, IsXeHpOrXeHpgCore) { + testAppendMemoryCopySignalScopeEventToSubDevice(); +} + +using AppendMemoryCopyXeHpAndLaterSinglePacket = Test>; + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterSinglePacket, + givenCommandListWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForRegisterOnly, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 1; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 0; + postSyncAddressZero = true; + + testAppendMemoryCopyThreeKernels(); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterSinglePacket, + givenCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleKernel, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 1; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernel(); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterSinglePacket, + givenCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForRegisterAndL3FlushWithNoPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 1; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 0; + expectedPostSyncPipeControls = 0; + postSyncAddressZero = true; + + testAppendMemoryCopyThreeKernelsAndL3Flush(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterSinglePacket, + givenCommandListAndEventWithSignalScopeWhenImmediateProvidedByPipeControlPostSyncPassedToMemoryCopyThenEventProfilingCalledForPipeControlAndL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 1; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 0; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = true; + + testAppendMemoryCopyThreeKernelsAndL3Flush(0); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterSinglePacket, + givenCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleSeparateKernelAndL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernelAndL3Flush(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterSinglePacket, + givenCommandListAndEventWithSignalScopeWhenImmediateProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleSeparateKernelAndL3FlushWithPostSyncAddedOnce, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = L0HwHelper::get(gfxCoreFamily).multiTileCapablePlatform() ? 3 : 1; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernelAndL3Flush(0); +} + +HWTEST2_F(AppendMemoryCopyXeHpAndLaterSinglePacket, + givenCommandListWhenMemoryCopyWithSignalEventScopeSetToSubDeviceThenB2BPipeControlIsAddedWithDcFlushWithPostSyncForLastPC, IsXeHpOrXeHpgCore) { + testAppendMemoryCopySignalScopeEventToSubDevice(); +} + +template +struct MultiTileAppendMemoryCopyXeHpAndLaterFixture : public ImplicitScalingRootDevice { + void setUp() { + DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); + ImplicitScalingRootDevice::setUp(); + } + + template + void testAppendMemoryCopyThreeKernels() { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + MockAppendMemoryCopy commandList; + commandList.appendMemoryCopyKernelWithGACallBase = true; + + commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + EXPECT_EQ(2u, commandList.partitionCount); + void *srcPtr = reinterpret_cast(0x1231); + void *dstPtr = reinterpret_cast(0x200002345); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); + uint64_t thirdKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + 4 * event->getSinglePacketSize(); + + commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr); + EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled); + EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), + commandList.commandContainer.getCommandStream()->getUsed())); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(3u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + auto secondWalker = itorWalkers[1]; + auto thirdWalker = itorWalkers[2]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*secondWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*thirdWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + } + + template + void testAppendMemoryCopyThreeKernelsAndL3Flush(ze_event_pool_flags_t eventPoolFlags) { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + MockAppendMemoryCopy commandList; + commandList.appendMemoryCopyKernelWithGACallBase = true; + + commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + EXPECT_EQ(2u, commandList.partitionCount); + auto &commandContainer = commandList.commandContainer; + + void *srcPtr = reinterpret_cast(0x1231); + void *dstPtr = reinterpret_cast(0x200002345); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = eventPoolFlags; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); + uint64_t thirdKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + 4 * event->getSinglePacketSize(); + + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + + EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled); + EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(3u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + auto secondWalker = itorWalkers[1]; + auto thirdWalker = itorWalkers[2]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*secondWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + walkerCmd = genCmdCast(*thirdWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + uint64_t l3FlushPostSyncAddress = thirdKernelEventAddress + 2 * event->getSinglePacketSize(); + if (usePipeControlMultiPacketEventSync == 1) { + l3FlushPostSyncAddress = event->getGpuAddress(device); + } + if (event->isUsingContextEndOffset()) { + l3FlushPostSyncAddress += event->getContextEndOffset(); + } + + auto itorPipeControls = findAll(thirdWalker, cmdList.end()); + + uint32_t postSyncPipeControls = 0; + uint32_t dcFlushFound = 0; + + for (auto it : itorPipeControls) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); + postSyncPipeControls++; + } + if (cmd->getDcFlushEnable()) { + dcFlushFound++; + } + } + + constexpr uint32_t expectedDcFlush = 2; // dc flush for last cross-tile sync and separately for signal scope event after last kernel split + EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls); + EXPECT_EQ(expectedDcFlush, dcFlushFound); + } + + template + void testAppendMemoryCopySingleKernel() { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + MockAppendMemoryCopy commandList; + commandList.appendMemoryCopyKernelWithGACallBase = true; + + commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + EXPECT_EQ(2u, commandList.partitionCount); + void *srcPtr = reinterpret_cast(0x1000); + void *dstPtr = reinterpret_cast(0x20000000); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + + commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100000000, event->toHandle(), 0, nullptr); + EXPECT_EQ(1u, commandList.appendMemoryCopyKernelWithGACalled); + EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), + commandList.commandContainer.getCommandStream()->getUsed())); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + } + + template + void testAppendMemoryCopySingleKernelAndL3Flush(ze_event_pool_flags_t eventPoolFlags) { + using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + using OPERATION = typename POSTSYNC_DATA::OPERATION; + + MockAppendMemoryCopy commandList; + commandList.appendMemoryCopyKernelWithGACallBase = true; + + commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + EXPECT_EQ(2u, commandList.partitionCount); + auto &commandContainer = commandList.commandContainer; + + void *srcPtr = reinterpret_cast(0x1000); + void *dstPtr = reinterpret_cast(0x200000000); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = eventPoolFlags; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device); + + size_t usedBefore = commandContainer.getCommandStream()->getUsed(); + commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100000000, event->toHandle(), 0, nullptr); + size_t usedAfter = commandContainer.getCommandStream()->getUsed(); + + EXPECT_EQ(1u, commandList.appendMemoryCopyKernelWithGACalled); + EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); + EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse()); + EXPECT_EQ(expectedKernelCount, event->getKernelCount()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, itorWalkers.size()); + auto firstWalker = itorWalkers[0]; + + auto walkerCmd = genCmdCast(*firstWalker); + EXPECT_EQ(static_cast(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + + uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + 2 * event->getSinglePacketSize(); + if (event->isUsingContextEndOffset()) { + l3FlushPostSyncAddress += event->getContextEndOffset(); + } + + auto itorPipeControls = findAll(firstWalker, cmdList.end()); + + uint32_t postSyncPipeControls = 0; + uint32_t dcFlushFound = 0; + + for (auto it : itorPipeControls) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); + postSyncPipeControls++; + } + if (cmd->getDcFlushEnable()) { + dcFlushFound++; + } + } + + constexpr uint32_t expectedDcFlush = 2; // dc flush for last cross-tile sync and separately for signal scope event after last kernel split + EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls); + EXPECT_EQ(expectedDcFlush, dcFlushFound); + } + + DebugManagerStateRestore restorer; + uint32_t expectedPacketsInUse = 0; + uint32_t expectedKernelCount = 0; + uint32_t expectedWalkerPostSyncOp = 0; + uint32_t expectedPostSyncPipeControls = 0; + bool postSyncAddressZero = false; +}; + +using MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket = Test>; + +using MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket = Test>; + +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket, givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernels, IsAtLeastXeHpCore) { - using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER; - using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + expectedPacketsInUse = 6; + expectedKernelCount = 3; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; - MockAppendMemoryCopy commandList; - commandList.appendMemoryCopyKernelWithGACallBase = true; - - commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - EXPECT_EQ(2u, commandList.partitionCount); - void *srcPtr = reinterpret_cast(0x1231); - void *dstPtr = reinterpret_cast(0x200002345); - - ze_event_pool_desc_t eventPoolDesc = {}; - eventPoolDesc.count = 1; - eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; - - ze_event_desc_t eventDesc = {}; - eventDesc.index = 0; - - ze_result_t result = ZE_RESULT_SUCCESS; - auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); - - uint64_t firstKernelEventAddress = event->getGpuAddress(device); - uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); - uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 4 * event->getSinglePacketSize(); - - commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr); - EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled); - EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); - EXPECT_EQ(6u, event->getPacketsInUse()); - EXPECT_EQ(3u, event->getKernelCount()); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), - commandList.commandContainer.getCommandStream()->getUsed())); - - auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); - ASSERT_EQ(3u, itorWalkers.size()); - auto firstWalker = itorWalkers[0]; - auto secondWalker = itorWalkers[1]; - auto thirdWalker = itorWalkers[2]; - - auto walkerCmd = genCmdCast(*firstWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*secondWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*thirdWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); + testAppendMemoryCopyThreeKernels(); } -HWTEST2_F(AppendMemoryCopyXeHpAndLater, - givenCommandListAndEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernelsAndL3FlushWithPostSyncAddedOnce, +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket, + givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleSeparateMultiTileKernel, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernel(); +} + +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket, + givenMultiTileCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernelsAndL3FlushWithPostSyncAddedForScopedEvent, IsXeHpOrXeHpgCore) { - using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER; - using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; - using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + expectedPacketsInUse = 8; + expectedKernelCount = 3; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; - MockAppendMemoryCopy commandList; - commandList.appendMemoryCopyKernelWithGACallBase = true; - - commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - void *srcPtr = reinterpret_cast(0x1231); - void *dstPtr = reinterpret_cast(0x200002345); - - ze_event_pool_desc_t eventPoolDesc = {}; - eventPoolDesc.count = 1; - eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; - - ze_event_desc_t eventDesc = {}; - eventDesc.index = 0; - eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; - - ze_result_t result = ZE_RESULT_SUCCESS; - auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); - - uint64_t firstKernelEventAddress = event->getGpuAddress(device); - uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize(); - uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); - - commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr); - EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled); - EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); - EXPECT_EQ(4u, event->getPacketsInUse()); - EXPECT_EQ(3u, event->getKernelCount()); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), - commandList.commandContainer.getCommandStream()->getUsed())); - - auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); - ASSERT_EQ(3u, itorWalkers.size()); - auto firstWalker = itorWalkers[0]; - auto secondWalker = itorWalkers[1]; - auto thirdWalker = itorWalkers[2]; - - auto walkerCmd = genCmdCast(*firstWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*secondWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*thirdWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - uint64_t l3FlushPostSyncAddress = thirdKernelEventAddress + event->getSinglePacketSize(); - if (event->isUsingContextEndOffset()) { - l3FlushPostSyncAddress += event->getContextEndOffset(); - } - - auto itorPipeControls = findAll(firstWalker, cmdList.end()); - - uint32_t postSyncPipeControls = 0; - uint32_t dcFlushFound = 0; - for (auto it : itorPipeControls) { - auto cmd = genCmdCast(*it); - if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { - postSyncPipeControls++; - EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); - EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); - } - if (cmd->getDcFlushEnable()) { - dcFlushFound++; - } - } - EXPECT_EQ(1u, postSyncPipeControls); - EXPECT_EQ(1u, dcFlushFound); + testAppendMemoryCopyThreeKernelsAndL3Flush(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); } -HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLater, - givenMultiTileCommandListAndEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernelsAndL3FlushWithPostSyncAddedForScopedEvent, +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket, + givenMultiTileCommandListAndEventWithSignalScopeWhenImmdiateProvidedByComputeWalkerAndPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernelsAndL3FlushWithPostSyncAddedForScopedEvent, IsXeHpOrXeHpgCore) { - using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; - using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; - using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + expectedPacketsInUse = 8; + expectedKernelCount = 3; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; - MockAppendMemoryCopy commandList; - commandList.appendMemoryCopyKernelWithGACallBase = true; - - commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - EXPECT_EQ(2u, commandList.partitionCount); - auto &commandContainer = commandList.commandContainer; - - void *srcPtr = reinterpret_cast(0x1231); - void *dstPtr = reinterpret_cast(0x200002345); - - ze_event_pool_desc_t eventPoolDesc = {}; - eventPoolDesc.count = 1; - eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; - - ze_event_desc_t eventDesc = {}; - eventDesc.index = 0; - eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; - - ze_result_t result = ZE_RESULT_SUCCESS; - auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); - - uint64_t firstKernelEventAddress = event->getGpuAddress(device); - uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize(); - uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 4 * event->getSinglePacketSize(); - - size_t usedBefore = commandContainer.getCommandStream()->getUsed(); - commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr); - size_t usedAfter = commandContainer.getCommandStream()->getUsed(); - - EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled); - EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled); - EXPECT_EQ(8u, event->getPacketsInUse()); - EXPECT_EQ(3u, event->getKernelCount()); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, - ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), - usedAfter - usedBefore)); - - auto itorWalkers = findAll(cmdList.begin(), cmdList.end()); - ASSERT_EQ(3u, itorWalkers.size()); - auto firstWalker = itorWalkers[0]; - auto secondWalker = itorWalkers[1]; - auto thirdWalker = itorWalkers[2]; - - auto walkerCmd = genCmdCast(*firstWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*secondWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - walkerCmd = genCmdCast(*thirdWalker); - EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); - EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress()); - - uint64_t l3FlushPostSyncAddress = thirdKernelEventAddress + 2 * event->getSinglePacketSize(); - if (event->isUsingContextEndOffset()) { - l3FlushPostSyncAddress += event->getContextEndOffset(); - } - - auto itorPipeControls = findAll(thirdWalker, cmdList.end()); - - uint32_t postSyncPipeControls = 0; - uint32_t dcFlushFound = 0; - - for (auto it : itorPipeControls) { - auto cmd = genCmdCast(*it); - if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { - EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); - EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); - EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); - postSyncPipeControls++; - } - if (cmd->getDcFlushEnable()) { - dcFlushFound++; - } - } - - constexpr uint32_t expectedDcFlush = 2; //dc flush for last cross-tile sync and separately for signal scope event after last kernel split - EXPECT_EQ(1u, postSyncPipeControls); - EXPECT_EQ(expectedDcFlush, dcFlushFound); + testAppendMemoryCopyThreeKernelsAndL3Flush(0); } -HWTEST2_F(AppendMemoryCopyXeHpAndLater, - givenCommandListWhenMemoryCopyWithSignalEventScopeSetToSubDeviceThenB2BPipeControlIsAddedWithDcFlushWithPostSyncForLastPC, IsXeHpOrXeHpgCore) { - using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; - using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket, + givenMultiTileCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleSeparateMultiTileKernelAndL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 4; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; - ze_result_t result = ZE_RESULT_SUCCESS; - std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result)); - auto &commandContainer = commandList->commandContainer; - - void *srcPtr = reinterpret_cast(0x1234); - void *dstPtr = reinterpret_cast(0x2345); - - ze_event_pool_desc_t eventPoolDesc = {}; - eventPoolDesc.count = 1; - auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - ze_event_desc_t eventDesc = {}; - eventDesc.index = 0; - eventDesc.signal = ZE_EVENT_SCOPE_FLAG_SUBDEVICE; - auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); - - size_t usedBefore = commandContainer.getCommandStream()->getUsed(); - result = commandList->appendMemoryCopy(dstPtr, srcPtr, 0x1001, event.get(), 0u, nullptr); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - size_t usedAfter = commandContainer.getCommandStream()->getUsed(); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, - ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore), - usedAfter - usedBefore)); - - auto itorWalker = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itorWalker); - - auto pipeControls = findAll(itorWalker, cmdList.end()); - uint32_t postSyncFound = 0; - uint32_t dcFlushFound = 0; - ASSERT_NE(0u, pipeControls.size()); - for (auto it : pipeControls) { - auto cmd = genCmdCast(*it); - if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA && - cmd->getImmediateData() == Event::STATE_SIGNALED) { - postSyncFound++; - } - if (cmd->getDcFlushEnable()) { - dcFlushFound++; - } - } - - constexpr uint32_t expectedDcFlushFound = 1u; - - EXPECT_EQ(1u, postSyncFound); - EXPECT_EQ(expectedDcFlushFound, dcFlushFound); + testAppendMemoryCopySingleKernelAndL3Flush(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); } +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterMultiPacket, + givenMultiTileCommandListAndEventWithSignalScopeWhenImmdiateProvidedByComputeWalkerAndPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleSeparateMultiTileKernelAndL3FlushWithPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 4; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernelAndL3Flush(0); +} + +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket, + givenMultiTileCommandListWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForMultiTileRegisterPipeControlPacket, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 0; + postSyncAddressZero = true; + + testAppendMemoryCopyThreeKernels(); +} + +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket, + givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleSeparateMultiTileKernel, + IsAtLeastXeHpCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernel(); +} + +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket, + givenMultiTileCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByRegisterPostSyncPassedToMemoryCopyThenAppendProfilingCalledForMultiTileRegisterPostSyncAndL3FlushForScopedEvent, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 0; + expectedPostSyncPipeControls = 0; + postSyncAddressZero = true; + + testAppendMemoryCopyThreeKernelsAndL3Flush(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); +} + +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket, + givenMultiTileCommandListAndEventWithSignalScopeWhenImmediateProvidedByPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForPipeControlPostSyncAndL3FlushAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 2; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 0; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = true; + + testAppendMemoryCopyThreeKernelsAndL3Flush(0); +} + +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket, + givenMultiTileCommandListAndTimestampEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleKernelPostSync, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 4; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernelAndL3Flush(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP); +} + +HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLaterSinglePacket, + givenMultiTileCommandListAndEventWithSignalScopeWhenImmediateProvidedByComputeWalkerAndPipeControlPostSyncPassedToMemoryCopyThenAppendProfilingCalledForSingleKernelAndL3FlushPipeControlPostSyncAddedForScopedEvent, + IsXeHpOrXeHpgCore) { + expectedPacketsInUse = 4; + expectedKernelCount = 1; + expectedWalkerPostSyncOp = 3; + expectedPostSyncPipeControls = 1; + postSyncAddressZero = false; + + testAppendMemoryCopySingleKernelAndL3Flush(0); +} } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp b/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp index e0c9bda1ae..ff11197a2b 100644 --- a/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp +++ b/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp @@ -629,5 +629,11 @@ HWTEST2_F(L0HwHelperTest, whenAlwaysAllocateEventInLocalMemCalledThenReturnFalse EXPECT_FALSE(l0HwHelper.alwaysAllocateEventInLocalMem()); } +TEST_F(L0HwHelperTest, givenL0HelperWhenGettingDefaultValueForUsePipeControlMultiKernelEventSyncThenReturnFalse) { + auto hwInfo = *NEO::defaultHwInfo.get(); + bool defaultValue = L0::L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo); + EXPECT_FALSE(defaultValue); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp index fa792aec6e..9efb9a3fad 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp @@ -654,7 +654,7 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, result = commandList->appendPageFaultCopy(dstAllocation, srcAllocation, size, false); EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel); - EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation); + EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation); EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory); GenCmdList commands; @@ -1144,7 +1144,8 @@ class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImm uint64_t srcOffset, uint64_t size, uint64_t elementSize, Builtin builtin, Event *signalEvent, - bool isStateless) override { + bool isStateless, + CmdListKernelLaunchParams &launchParams) override { appendMemoryCopyKernelWithGACalled++; return ZE_RESULT_SUCCESS; } diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 20806d862e..a39fed1d91 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -417,6 +417,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UseDrmCompletionFenceForAllAllocations, -1, "Use DECLARE_DEBUG_VARIABLE(int32_t, EnableChipsetUniqueUUID, -1, "Enables retrieving chipset unique UUID using telemetry, -1:default (enabled), 0:disable, 1:enable") DECLARE_DEBUG_VARIABLE(int32_t, EnableFlushTaskSubmission, -1, "Driver uses csr flushTask for immediate commandlist submissions, -1:default (enabled), 0:disabled, 1:enabled") DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateCmdListHeapSharing, -1, "Immediate command lists using flush task use current csr heap instead private cmd list heap, -1:default (disabled), 0:disabled, 1:enabled") +DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlMultiKernelEventSync, -1, "Use single PIPE_CONTROL for event signal of multi-kernel append operations instead multi-packet POSTSYNC_DATA from each COMPUTE_WALKER, -1: default , 0: disabled, 1: enabled") DECLARE_DEBUG_VARIABLE(int32_t, EnableBcsSwControlWa, -1, "Enable BCS WA via BCSSWCONTROL MMIO. -1: default, 0: disabled, 1: if src in system mem, 2: if dst in system mem, 3: if src and dst in system mem, 4: always") /* IMPLICIT SCALING */ diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 9866790cc2..e1b3df4958 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -69,6 +69,7 @@ ForceAuxTranslationEnabled = -1 DisableTimestampPacketOptimizations = 0 DisableCachingForStatefulBufferAccess = 0 PrintDebugSettings = 0 +UsePipeControlMultiKernelEventSync = -1 PrintDebugMessages = 0 DumpKernels = 0 DumpKernelArgs = 0