Use pipe control to signal event of multi-kernel operations

Related-To: NEO-7434

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Author: Zbigniew Zdanowicz
Date: 2022-10-19 15:32:43 +00:00
Committed by: Compute-Runtime-Automation
Parent: 79386cd7f7
Commit: daa26701e4
18 changed files with 1747 additions and 601 deletions
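
In short: when an append operation (memory copy or memory fill) is split into several internal built-in kernels, the signal event no longer has to be written by every COMPUTE_WALKER post-sync. With the new UsePipeControlMultiKernelEventSync debug flag enabled, the per-kernel walker post-sync for the event is suppressed and the event is signalled once, through the existing pipe-control/post-walker path, after the last kernel of the split. A condensed sketch of the decision, simplified from the hunks below (not the actual driver code):

bool singlePipeControlPacket = this->pipeControlMultiKernelEventSync &&
                               launchParams.isKernelSplitOperation;

appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket);  // before the walkers
// ... dispatch the split kernels; with singlePipeControlPacket set, each
//     appendLaunchKernelSplit call drops the per-kernel event post-sync ...
appendEventForProfilingAllWalkers(signalEvent, false, singlePipeControlPacket); // after the walkers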

View File

@@ -171,15 +171,22 @@ void CommandList::migrateSharedAllocations() {
}
}
bool CommandList::setupTimestampEventForMultiTile(Event *signalEvent) {
bool CommandList::isTimestampEventForMultiTile(Event *signalEvent) {
if (this->partitionCount > 1 &&
signalEvent) {
if (signalEvent->isEventTimestampFlagSet()) {
signalEvent->setPacketsInUse(this->partitionCount);
return true;
}
}
return false;
}
bool CommandList::setupTimestampEventForMultiTile(Event *signalEvent) {
if (isTimestampEventForMultiTile(signalEvent)) {
signalEvent->setPacketsInUse(this->partitionCount);
return true;
}
return false;
}
} // namespace L0
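
The hunk above splits the old setupTimestampEventForMultiTile into a query (isTimestampEventForMultiTile) and a setup step, so the post-walker path can test the multi-tile timestamp condition without changing the event's packet count again. A minimal usage sketch, mirroring the appendEventForProfilingAllWalkers hunk later in this commit:

bool workloadPartition = setupTimestampEventForMultiTile(signalEvent);   // before walker: also sets packets in use
appendEventForProfiling(signalEvent, true, workloadPartition);
// ... walker ...
appendSignalEventPostWalker(signalEvent, isTimestampEventForMultiTile(signalEvent));  // after walker: query only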

View File

@@ -308,6 +308,7 @@ struct CommandList : _ze_command_list_handle_t {
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed);
bool setupTimestampEventForMultiTile(Event *signalEvent);
bool isTimestampEventForMultiTile(Event *signalEvent);
bool getDcFlushRequired(bool externalCondition) const {
return externalCondition ? dcFlushSupport : false;
}

View File

@@ -43,6 +43,17 @@ struct AlignedAllocationData {
bool needsFlush = false;
};
struct CmdListFillKernelArguments {
size_t mainOffset = 0;
size_t mainGroupSize = 0;
size_t groups = 0;
size_t rightOffset = 0;
size_t patternOffsetRemainder = 0;
uint32_t leftRemainingBytes = 0;
uint32_t rightRemainingBytes = 0;
uint32_t patternSizeInEls = 0;
};
struct EventPool;
struct Event;
@@ -157,7 +168,7 @@ struct CommandListCoreFamily : CommandListImp {
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
void appendMultiPartitionPrologue(uint32_t partitionDataSize) override;
void appendMultiPartitionEpilogue() override;
void appendEventForProfilingAllWalkers(Event *event, bool beforeWalker);
void appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent);
ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
ze_result_t reserveSpace(size_t size, void **ptr) override;
@@ -173,7 +184,8 @@ struct CommandListCoreFamily : CommandListImp {
uint64_t srcOffset, uint64_t size,
uint64_t elementSize, Builtin builtin,
Event *signalEvent,
bool isStateless);
bool isStateless,
CmdListKernelLaunchParams &launchParams);
MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyBlit(uintptr_t dstPtr,
NEO::GraphicsAllocation *dstPtrAlloc,
@@ -270,9 +282,15 @@ struct CommandListCoreFamily : CommandListImp {
void handlePostSubmissionState();
virtual void createLogicalStateHelper();
void setupFillKernelArguments(size_t baseOffset,
size_t patternSize,
size_t dstSize,
CmdListFillKernelArguments &outArguments,
Kernel *kernel);
size_t cmdListCurrentStartOffset = 0;
bool containsAnyKernel = false;
bool pipeControlMultiKernelEventSync = false;
};
template <PRODUCT_FAMILY gfxProductFamily>
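
A field-by-field reading of the new CmdListFillKernelArguments struct, with meanings inferred from setupFillKernelArguments later in this commit (the comments are interpretive, not from the source):

struct CmdListFillKernelArguments {
    size_t mainOffset = 0;              // destination offset used by the main, aligned fill kernel
    size_t mainGroupSize = 0;           // group size chosen for the main kernel
    size_t groups = 0;                  // number of groups dispatched by the main kernel
    size_t rightOffset = 0;             // destination offset for the trailing leftover kernel
    size_t patternOffsetRemainder = 0;  // pattern offset for the trailing kernel (pattern path only)
    uint32_t leftRemainingBytes = 0;    // bytes handled by the leading unaligned kernel (immediate path only)
    uint32_t rightRemainingBytes = 0;   // bytes left over after the main kernel
    uint32_t patternSizeInEls = 0;      // pattern allocation size in uint32_t elements (pattern path only)
};

pipeControlMultiKernelEventSync caches the L0HwHelper::usePipeControlMultiKernelEventSync decision at command list initialization, so the append paths only test a bool.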

View File

@@ -136,6 +136,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::initialize(Device *device, NEO
this->stateComputeModeTracking = L0HwHelper::enableStateComputeModeTracking(hwInfo);
this->frontEndStateTracking = L0HwHelper::enableFrontEndStateTracking(hwInfo);
this->pipelineSelectStateTracking = L0HwHelper::enablePipelineSelectStateTracking(hwInfo);
this->pipeControlMultiKernelEventSync = L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo);
if (device->isImplicitScalingCapable() && !this->internalUsage && !isCopyOnly()) {
this->partitionCount = static_cast<uint32_t>(this->device->getNEODevice()->getDeviceBitfield().count());
@@ -926,7 +927,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(v
uint64_t elementSize,
Builtin builtin,
Event *signalEvent,
bool isStateless) {
bool isStateless,
CmdListKernelLaunchParams &launchParams) {
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
@@ -957,8 +959,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(v
ze_group_count_t dispatchKernelArgs{groups, 1u, 1u};
auto dstAllocationType = dstPtrAlloc->getAllocationType();
CmdListKernelLaunchParams launchParams = {};
launchParams.isKernelSplitOperation = true;
launchParams.isBuiltInKernel = true;
launchParams.isDestinationAllocationInSystemMemory =
(dstAllocationType == NEO::AllocationType::BUFFER_HOST_MEMORY) ||
@@ -1088,6 +1088,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
srcAddress, srcAllocation, 0u,
size);
} else {
CmdListKernelLaunchParams launchParams = {};
launchParams.isKernelSplitOperation = rightSize > 1;
ret = appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAddress),
dstAllocation, 0,
reinterpret_cast<void *>(&srcAddress),
@@ -1096,7 +1098,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
middleElSize,
Builtin::CopyBufferToBufferMiddle,
nullptr,
isStateless);
isStateless,
launchParams);
if (ret == ZE_RESULT_SUCCESS && rightSize) {
ret = appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAddress),
dstAllocation, size - rightSize,
@@ -1105,7 +1108,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
rightSize, 1UL,
Builtin::CopyBufferToBufferSide,
nullptr,
isStateless);
isStateless,
launchParams);
}
if (this->dcFlushSupport) {
@@ -1183,7 +1187,16 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
signalEvent = Event::fromHandle(hSignalEvent);
}
appendEventForProfilingAllWalkers(signalEvent, true);
uint32_t kernelCounter = leftSize > 0 ? 1 : 0;
kernelCounter += middleSizeBytes > 0 ? 1 : 0;
kernelCounter += rightSize > 0 ? 1 : 0;
CmdListKernelLaunchParams launchParams = {};
launchParams.isKernelSplitOperation = kernelCounter > 1;
bool singlePipeControlPacket = this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation;
appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket);
if (ret == ZE_RESULT_SUCCESS && leftSize) {
Builtin copyKernel = Builtin::CopyBufferToBufferSide;
@@ -1203,7 +1216,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
leftSize, 1UL,
copyKernel,
signalEvent,
isStateless);
isStateless,
launchParams);
}
}
@@ -1226,7 +1240,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
middleElSize,
copyKernel,
signalEvent,
isStateless);
isStateless,
launchParams);
}
}
@@ -1248,11 +1263,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
rightSize, 1UL,
copyKernel,
signalEvent,
isStateless);
isStateless,
launchParams);
}
}
appendEventForProfilingAllWalkers(signalEvent, false);
appendEventForProfilingAllWalkers(signalEvent, false, singlePipeControlPacket);
addFlushRequiredCommand(dstAllocationStruct.needsFlush, signalEvent);
if (NEO::DebugManager.flags.EnableSWTags.get()) {
@@ -1564,86 +1580,70 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
}
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
CmdListKernelLaunchParams launchParams = {};
launchParams.isKernelSplitOperation = true;
launchParams.isBuiltInKernel = true;
launchParams.isDestinationAllocationInSystemMemory = hostPointerNeedsFlush;
Kernel *builtinKernel = nullptr;
if (patternSize == 1) {
size_t middleSize = size;
uint32_t leftRemainder = sizeof(uint32_t) - (dstAllocation.offset % sizeof(uint32_t));
if (dstAllocation.offset % sizeof(uint32_t) != 0 && leftRemainder <= size) {
res = appendUnalignedFillKernel(isStateless, leftRemainder, dstAllocation, pattern, signalEvent, launchParams);
if (res) {
return res;
}
middleSize -= leftRemainder;
dstAllocation.offset += leftRemainder;
}
Kernel *builtinKernel = nullptr;
if (isStateless) {
builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateStateless);
} else {
builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate);
}
const auto dataTypeSize = sizeof(uint32_t) * 4;
size_t adjustedSize = middleSize / dataTypeSize;
size_t groupSizeX = device->getDeviceInfo().maxWorkGroupSize;
if (groupSizeX > adjustedSize && adjustedSize > 0) {
groupSizeX = adjustedSize;
} else {
if (isStateless) {
builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddleStateless);
} else {
builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle);
}
if (builtinKernel->setGroupSize(static_cast<uint32_t>(groupSizeX), 1u, 1u)) {
}
CmdListKernelLaunchParams launchParams = {};
launchParams.isBuiltInKernel = true;
launchParams.isDestinationAllocationInSystemMemory = hostPointerNeedsFlush;
CmdListFillKernelArguments fillArguments = {};
setupFillKernelArguments(dstAllocation.offset, patternSize, size, fillArguments, builtinKernel);
launchParams.isKernelSplitOperation = (fillArguments.leftRemainingBytes > 0 || fillArguments.rightRemainingBytes > 0);
bool singlePipeControlPacket = this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation;
appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket);
if (patternSize == 1) {
if (fillArguments.leftRemainingBytes > 0) {
res = appendUnalignedFillKernel(isStateless, fillArguments.leftRemainingBytes, dstAllocation, pattern, signalEvent, launchParams);
if (res) {
return res;
}
}
if (builtinKernel->setGroupSize(static_cast<uint32_t>(fillArguments.mainGroupSize), 1u, 1u)) {
DEBUG_BREAK_IF(true);
return ZE_RESULT_ERROR_UNKNOWN;
}
size_t groups = adjustedSize / groupSizeX;
uint32_t remainingBytes = static_cast<uint32_t>((adjustedSize % groupSizeX) * dataTypeSize +
middleSize % dataTypeSize);
ze_group_count_t dispatchKernelArgs{static_cast<uint32_t>(groups), 1u, 1u};
ze_group_count_t dispatchKernelArgs{static_cast<uint32_t>(fillArguments.groups), 1u, 1u};
uint32_t value = 0;
memset(&value, *reinterpret_cast<const unsigned char *>(pattern), 4);
builtinKernel->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinKernel->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
builtinKernel->setArgumentValue(1, sizeof(fillArguments.mainOffset), &fillArguments.mainOffset);
builtinKernel->setArgumentValue(2, sizeof(value), &value);
appendEventForProfilingAllWalkers(signalEvent, true);
res = appendLaunchKernelSplit(builtinKernel, &dispatchKernelArgs, signalEvent, launchParams);
if (res) {
return res;
}
if (remainingBytes) {
dstAllocation.offset += (middleSize - remainingBytes);
res = appendUnalignedFillKernel(isStateless, remainingBytes, dstAllocation, pattern, signalEvent, launchParams);
if (fillArguments.rightRemainingBytes > 0) {
dstAllocation.offset = fillArguments.rightOffset;
res = appendUnalignedFillKernel(isStateless, fillArguments.rightRemainingBytes, dstAllocation, pattern, signalEvent, launchParams);
if (res) {
return res;
}
}
} else {
Kernel *builtinKernel = nullptr;
if (isStateless) {
builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddleStateless);
} else {
builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle);
}
size_t middleElSize = sizeof(uint32_t);
size_t adjustedSize = size / middleElSize;
uint32_t groupSizeX = static_cast<uint32_t>(adjustedSize);
uint32_t groupSizeY = 1, groupSizeZ = 1;
builtinKernel->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
builtinKernel->setGroupSize(groupSizeX, groupSizeY, groupSizeZ);
uint32_t groups = static_cast<uint32_t>(adjustedSize) / groupSizeX;
uint32_t remainingBytes = static_cast<uint32_t>((adjustedSize % groupSizeX) * middleElSize +
size % middleElSize);
builtinKernel->setGroupSize(static_cast<uint32_t>(fillArguments.mainGroupSize), 1, 1);
size_t patternAllocationSize = alignUp(patternSize, MemoryConstants::cacheLineSize);
uint32_t patternSizeInEls = static_cast<uint32_t>(patternAllocationSize / middleElSize);
auto patternGfxAlloc = device->obtainReusableAllocation(patternAllocationSize, NEO::AllocationType::FILL_PATTERN);
if (patternGfxAlloc == nullptr) {
patternGfxAlloc = device->getDriverHandle()->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getNEODevice()->getRootDeviceIndex(),
@@ -1666,22 +1666,21 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
patternAllocOffset += patternSizeToCopy;
} while (patternAllocOffset < patternAllocationSize);
builtinKernel->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinKernel->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
builtinKernel->setArgBufferWithAlloc(2, reinterpret_cast<uintptr_t>(patternGfxAllocPtr), patternGfxAlloc);
builtinKernel->setArgumentValue(3, sizeof(patternSizeInEls), &patternSizeInEls);
builtinKernel->setArgumentValue(3, sizeof(fillArguments.patternSizeInEls), &fillArguments.patternSizeInEls);
appendEventForProfilingAllWalkers(signalEvent, true);
ze_group_count_t dispatchKernelArgs{groups, 1u, 1u};
ze_group_count_t dispatchKernelArgs{static_cast<uint32_t>(fillArguments.groups), 1u, 1u};
res = appendLaunchKernelSplit(builtinKernel, &dispatchKernelArgs, signalEvent, launchParams);
if (res) {
return res;
}
if (remainingBytes) {
uint32_t dstOffsetRemainder = groups * groupSizeX * static_cast<uint32_t>(middleElSize);
uint64_t patternOffsetRemainder = (groupSizeX * groups & (patternSizeInEls - 1)) * middleElSize;
if (fillArguments.rightRemainingBytes > 0) {
uint32_t dstOffsetRemainder = static_cast<uint32_t>(fillArguments.rightOffset);
uint64_t patternOffsetRemainder = fillArguments.patternOffsetRemainder;
Kernel *builtinKernelRemainder;
if (isStateless) {
@@ -1690,7 +1689,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
builtinKernelRemainder = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferRightLeftover);
}
builtinKernelRemainder->setGroupSize(remainingBytes, 1u, 1u);
builtinKernelRemainder->setGroupSize(fillArguments.rightRemainingBytes, 1u, 1u);
ze_group_count_t dispatchKernelArgs{1u, 1u, 1u};
builtinKernelRemainder->setArgBufferWithAlloc(0,
@@ -1711,7 +1710,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
}
}
appendEventForProfilingAllWalkers(signalEvent, false);
appendEventForProfilingAllWalkers(signalEvent, false, singlePipeControlPacket);
addFlushRequiredCommand(hostPointerNeedsFlush, signalEvent);
if (NEO::DebugManager.flags.EnableSWTags.get()) {
@@ -2544,4 +2543,57 @@ void CommandListCoreFamily<gfxCoreFamily>::addFlushRequiredCommand(bool flushOpe
}
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::setupFillKernelArguments(size_t baseOffset,
size_t patternSize,
size_t dstSize,
CmdListFillKernelArguments &outArguments,
Kernel *kernel) {
if (patternSize == 1) {
size_t middleSize = dstSize;
outArguments.mainOffset = baseOffset;
outArguments.leftRemainingBytes = sizeof(uint32_t) - (baseOffset % sizeof(uint32_t));
if (baseOffset % sizeof(uint32_t) != 0 && outArguments.leftRemainingBytes <= dstSize) {
middleSize -= outArguments.leftRemainingBytes;
outArguments.mainOffset += outArguments.leftRemainingBytes;
} else {
outArguments.leftRemainingBytes = 0;
}
const auto dataTypeSize = sizeof(uint32_t) * 4;
size_t adjustedSize = middleSize / dataTypeSize;
outArguments.mainGroupSize = this->device->getDeviceInfo().maxWorkGroupSize;
if (outArguments.mainGroupSize > adjustedSize && adjustedSize > 0) {
outArguments.mainGroupSize = adjustedSize;
}
outArguments.groups = adjustedSize / outArguments.mainGroupSize;
outArguments.rightRemainingBytes = static_cast<uint32_t>((adjustedSize % outArguments.mainGroupSize) * dataTypeSize +
middleSize % dataTypeSize);
if (outArguments.rightRemainingBytes > 0) {
outArguments.rightOffset = outArguments.mainOffset + (middleSize - outArguments.rightRemainingBytes);
}
} else {
size_t middleElSize = sizeof(uint32_t);
size_t adjustedSize = dstSize / middleElSize;
uint32_t groupSizeX = static_cast<uint32_t>(adjustedSize);
uint32_t groupSizeY = 1, groupSizeZ = 1;
kernel->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
outArguments.mainGroupSize = groupSizeX;
outArguments.groups = static_cast<uint32_t>(adjustedSize) / outArguments.mainGroupSize;
outArguments.rightRemainingBytes = static_cast<uint32_t>((adjustedSize % outArguments.mainGroupSize) * middleElSize +
dstSize % middleElSize);
size_t patternAllocationSize = alignUp(patternSize, MemoryConstants::cacheLineSize);
outArguments.patternSizeInEls = static_cast<uint32_t>(patternAllocationSize / middleElSize);
if (outArguments.rightRemainingBytes > 0) {
outArguments.rightOffset = outArguments.groups * outArguments.mainGroupSize * middleElSize;
outArguments.patternOffsetRemainder = (outArguments.mainGroupSize * outArguments.groups & (outArguments.patternSizeInEls - 1)) * middleElSize;
}
}
}
} // namespace L0
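
As a concrete check of the immediate-pattern branch of setupFillKernelArguments above, a worked example assuming patternSize == 1, baseOffset == 2, dstSize == 4096 and maxWorkGroupSize == 1024 (the device limit is an assumption):

// dataTypeSize        = sizeof(uint32_t) * 4 = 16
// leftRemainingBytes  = 4 - (2 % 4) = 2          (offset is unaligned and 2 <= 4096)
// mainOffset          = 2 + 2 = 4
// middleSize          = 4096 - 2 = 4094
// adjustedSize        = 4094 / 16 = 255
// mainGroupSize       = min(1024, 255) = 255
// groups              = 255 / 255 = 1
// rightRemainingBytes = (255 % 255) * 16 + 4094 % 16 = 14
// rightOffset         = 4 + (4094 - 14) = 4084
// Left, main and right kernels are all dispatched, so isKernelSplitOperation is true
// and, with UsePipeControlMultiKernelEventSync enabled, the event is signalled once.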

View File

@@ -246,7 +246,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(Kernel
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker) {
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent) {
if (beforeWalker) {
appendEventForProfiling(event, true, false);
} else {

View File

@@ -404,25 +404,31 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(Kernel
Event *event,
const CmdListKernelLaunchParams &launchParams) {
if (event) {
event->increaseKernelCount();
if (this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation) {
event = nullptr;
} else {
event->increaseKernelCount();
}
}
return appendLaunchKernelWithParams(kernel, threadGroupDimensions, event, launchParams);
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker) {
if (isCopyOnly()) {
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent) {
if (isCopyOnly() || singlePacketEvent) {
if (beforeWalker) {
appendEventForProfiling(event, true, false);
bool workloadPartition = setupTimestampEventForMultiTile(event);
appendEventForProfiling(event, true, workloadPartition);
} else {
appendSignalEventPostWalker(event, false);
bool workloadPartition = isTimestampEventForMultiTile(event);
appendSignalEventPostWalker(event, workloadPartition);
}
} else {
if (event) {
if (beforeWalker) {
event->zeroKernelCount();
} else {
if (getDcFlushRequired(!!event->signalScope)) {
if (event->getKernelCount() > 1 && getDcFlushRequired(!!event->signalScope)) {
programEventL3Flush<gfxCoreFamily>(event, this->device, this->partitionCount, this->commandContainer);
}
}
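
The two hunks above carry the core of the change: when a split operation runs with pipe-control event synchronization, each appendLaunchKernelSplit call drops its per-kernel event, and the event is handled once by the singlePacketEvent branch of appendEventForProfilingAllWalkers (the same branch copy-only command lists already use). The relevant decision, condensed from the hunk:

if (event) {
    if (this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation) {
        event = nullptr;               // kernel emits no post-sync write for this event
    } else {
        event->increaseKernelCount();  // legacy path: one packet per kernel
    }
}
return appendLaunchKernelWithParams(kernel, threadGroupDimensions, event, launchParams);

On the multi-kernel (non-single-packet) path, the post-walker L3 flush is now emitted only when the event actually accumulated more than one kernel, hence the added getKernelCount() > 1 check.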

View File

@@ -62,7 +62,7 @@ struct BcsSplit {
ze_result_t result = ZE_RESULT_SUCCESS;
if (hSignalEvent) {
cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true);
cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true, true);
}
auto markerEventIndex = this->events.obtainForSplit(Context::fromHandle(cmdList->hContext), MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate<gfxCoreFamily>::GfxFamily::TimestampPacketType));
@@ -86,10 +86,10 @@ struct BcsSplit {
}
cmdList->addEventsToCmdList(static_cast<uint32_t>(this->cmdQs.size()), eventHandles.data());
cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], false);
cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], false, true);
if (hSignalEvent) {
cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), false);
cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), false, true);
}
return result;

View File

@@ -46,4 +46,11 @@ bool L0HwHelper::enableImmediateCmdListHeapSharing(const NEO::HardwareInfo &hwIn
return platformSupport && cmdlistSupport;
}
bool L0HwHelper::usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo) {
if (NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.get() != -1) {
return !!NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.get();
}
return false;
}
} // namespace L0
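
usePipeControlMultiKernelEventSync defaults to disabled and is only enabled through the UsePipeControlMultiKernelEventSync debug flag; the test fixtures later in this commit toggle it the same way. A minimal sketch of such a toggle, assuming a hwInfo reference is available as in the other helpers:

DebugManagerStateRestore restorer;                                  // restores debug flags on scope exit
NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.set(1);  // opt in to single pipe-control signaling
EXPECT_TRUE(L0::L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo));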

View File

@@ -34,6 +34,7 @@ class L0HwHelper {
static bool enablePipelineSelectStateTracking(const NEO::HardwareInfo &hwInfo);
static bool enableStateComputeModeTracking(const NEO::HardwareInfo &hwInfo);
static bool enableImmediateCmdListHeapSharing(const NEO::HardwareInfo &hwInfo, bool cmdlistSupport);
static bool usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo);
virtual void setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const = 0;
virtual L0::Event *createEvent(L0::EventPool *eventPool, const ze_event_desc_t *desc, L0::Device *device) const = 0;

View File

@@ -65,6 +65,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass::initialize;
using BaseClass::partitionCount;
using BaseClass::patternAllocations;
using BaseClass::pipeControlMultiKernelEventSync;
using BaseClass::pipelineSelectStateTracking;
using BaseClass::requiredStreamState;
using BaseClass::stateComputeModeTracking;
@@ -130,6 +131,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::immediateCmdListHeapSharing;
using BaseClass::isFlushTaskSubmissionEnabled;
using BaseClass::partitionCount;
using BaseClass::pipeControlMultiKernelEventSync;
using BaseClass::pipelineSelectStateTracking;
using BaseClass::requiredStreamState;
using BaseClass::stateComputeModeTracking;
@@ -143,6 +145,7 @@ struct MockCommandListImmediate : public CommandListCoreFamilyImmediate<gfxCoreF
using BaseClass::containsAnyKernel;
using BaseClass::immediateCmdListHeapSharing;
using BaseClass::indirectAllocationsAllowed;
using BaseClass::pipeControlMultiKernelEventSync;
using BaseClass::requiredStreamState;
};
@@ -422,8 +425,9 @@ class MockAppendMemoryCopy : public CommandListCoreFamily<gfxCoreFamily> {
uint64_t srcOffset, uint64_t size,
uint64_t elementSize, Builtin builtin,
Event *signalEvent,
bool isStateless),
(dstPtr, dstPtrAlloc, dstOffset, srcPtr, srcPtrAlloc, srcOffset, size, elementSize, builtin, signalEvent, isStateless));
bool isStateless,
CmdListKernelLaunchParams &launchParams),
(dstPtr, dstPtrAlloc, dstOffset, srcPtr, srcPtrAlloc, srcOffset, size, elementSize, builtin, signalEvent, isStateless, launchParams));
ADDMETHOD_NOBASE(appendMemoryCopyBlit, ze_result_t, ZE_RESULT_SUCCESS,
(uintptr_t dstPtr,

View File

@@ -50,7 +50,8 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamily<gfxCoreFam
uint64_t elementSize,
Builtin builtin,
Event *signalEvent,
bool isStateless) override {
bool isStateless,
CmdListKernelLaunchParams &launchParams) override {
appendMemoryCopyKernelWithGACalledTimes++;
if (isStateless) {
appendMemoryCopyKernelWithGAStatelessCalledTimes++;

View File

@@ -575,7 +575,7 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyInExternalHostAl
commandList->appendMemoryCopy(dstPtr, srcPtr, 8, nullptr, 0, nullptr);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
}
@@ -593,7 +593,7 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyInUsmHostAllocat
commandList->appendMemoryCopy(dstBuffer, srcPtr, 8, nullptr, 0, nullptr);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
context->freeMem(dstBuffer);
@@ -617,7 +617,7 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyInUsmDeviceAlloc
commandList->appendMemoryCopy(dstBuffer, srcPtr, 8, nullptr, 0, nullptr);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
context->freeMem(dstBuffer);
@@ -638,7 +638,12 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillInUsmHostThenBui
commandList->appendMemoryFill(dstBuffer, pattern, patternSize, allocSize, nullptr, 0, nullptr);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
commandList->appendMemoryFill(dstBuffer, pattern, 1, allocSize, nullptr, 0, nullptr);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
context->freeMem(dstBuffer);
@@ -663,6 +668,43 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillInUsmDeviceThenB
commandList->appendMemoryFill(dstBuffer, pattern, patternSize, size, nullptr, 0, nullptr);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
commandList->appendMemoryFill(dstBuffer, pattern, 1, size, nullptr, 0, nullptr);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
context->freeMem(dstBuffer);
}
HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillRequiresMultiKernelsThenSplitFlagIsSet, IsAtLeastSkl) {
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
constexpr size_t patternSize = 8;
uint8_t pattern[patternSize] = {1, 2, 3, 4};
constexpr size_t size = 4096u;
constexpr size_t alignment = 4096u;
void *dstBuffer = nullptr;
ze_device_mem_alloc_desc_t deviceDesc = {};
auto result = context->allocDeviceMem(device->toHandle(),
&deviceDesc,
size, alignment, &dstBuffer);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
constexpr size_t fillSize = size - 1;
commandList->appendMemoryFill(dstBuffer, pattern, patternSize, fillSize, nullptr, 0, nullptr);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
commandList->appendMemoryFill(dstBuffer, pattern, 1, fillSize, nullptr, 0, nullptr);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
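
The flipped expectations above follow from the new split detection: isKernelSplitOperation is no longer hard-coded to true for built-in copy and fill operations but derived from how many kernels are actually dispatched, so a small aligned copy or an evenly divisible fill keeps the flag false, while the new fillSize = size - 1 case forces a leftover kernel and sets it. The detection, as introduced in the cmdlist_hw.inl hunks earlier in this commit:

// appendMemoryCopy: count the sub-copies that will really be dispatched
uint32_t kernelCounter = leftSize > 0 ? 1 : 0;
kernelCounter += middleSizeBytes > 0 ? 1 : 0;
kernelCounter += rightSize > 0 ? 1 : 0;
launchParams.isKernelSplitOperation = kernelCounter > 1;

// appendMemoryFill: split only when leading or trailing leftover kernels are needed
launchParams.isKernelSplitOperation =
    (fillArguments.leftRemainingBytes > 0 || fillArguments.rightRemainingBytes > 0);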

View File

@@ -89,6 +89,8 @@ class AppendFillFixture : public DeviceFixture {
delete[] dstPtr;
}
DebugManagerStateRestore restorer;
std::unique_ptr<Mock<MockDriverFillHandle>> driverHandle;
NEO::MockDevice *neoDevice = nullptr;
L0::Device *device = nullptr;
@@ -108,8 +110,6 @@ struct MultiTileAppendFillFixture : public AppendFillFixture {
DebugManager.flags.EnableImplicitScaling.set(1);
AppendFillFixture::setUp();
}
DebugManagerStateRestore restorer;
};
using AppendFillTest = Test<AppendFillFixture>;
@@ -480,203 +480,612 @@ HWTEST2_F(AppendFillTest,
false);
}
HWTEST2_F(AppendFillTest,
givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, IsAtLeastXeHpCore) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
template <int32_t usePipeControlMultiPacketEventSync>
struct AppendFillMultiPacketEventFixture : public AppendFillFixture {
void setUp() {
DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync);
AppendFillFixture::setUp();
}
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
template <GFXCORE_FAMILY gfxCoreFamily>
void testAppendMemoryFillManyImmediateKernels() {
using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using OPERATION = typename POSTSYNC_DATA::OPERATION;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
uint64_t firstKernelEventAddress = event->getGpuAddress(device);
uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize();
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
auto &commandContainer = commandList->commandContainer;
uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);
uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + event->getSinglePacketSize();
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(immediateDstPtr, &immediatePattern,
sizeof(immediatePattern),
immediateAllocSize, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
auto &commandContainer = commandList->commandContainer;
EXPECT_EQ(2u, event->getPacketsInUse());
EXPECT_EQ(2u, event->getKernelCount());
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(immediateDstPtr, &immediatePattern,
sizeof(immediatePattern),
immediateAllocSize, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
EXPECT_EQ(expectedKernelCount, event->getKernelCount());
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(2u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto secondWalker = itorWalkers[1];
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(2u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto secondWalker = itorWalkers[1];
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
}
template <GFXCORE_FAMILY gfxCoreFamily>
void testAppendMemoryFillManyKernels() {
using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using OPERATION = typename POSTSYNC_DATA::OPERATION;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);
uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + event->getSinglePacketSize();
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
auto &commandContainer = commandList->commandContainer;
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
EXPECT_EQ(expectedKernelCount, event->getKernelCount());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(2u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto secondWalker = itorWalkers[1];
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
}
template <GFXCORE_FAMILY gfxCoreFamily>
void testAppendMemoryFillSingleKernel() {
using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using OPERATION = typename POSTSYNC_DATA::OPERATION;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
int pattern = 0;
const size_t size = 1024;
uint8_t array[size] = {};
auto &commandContainer = commandList->commandContainer;
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(array, &pattern, 1, size, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
EXPECT_EQ(expectedKernelCount, event->getKernelCount());
uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(1u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
}
template <GFXCORE_FAMILY gfxCoreFamily>
void testAppendMemoryFillSingleKernelAndL3Flush() {
using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
int pattern = 0;
const size_t size = 1024;
uint8_t array[size] = {};
auto &commandContainer = commandList->commandContainer;
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(array, &pattern, 1, size, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
EXPECT_EQ(expectedKernelCount, event->getKernelCount());
uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(1u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + event->getSinglePacketSize();
if (event->isUsingContextEndOffset()) {
l3FlushPostSyncAddress += event->getContextEndOffset();
}
auto itorPipeControls = findAll<PIPE_CONTROL *>(firstWalker, cmdList.end());
uint32_t postSyncPipeControls = 0;
uint32_t dcFlushFound = 0;
for (auto it : itorPipeControls) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
postSyncPipeControls++;
EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
}
if (cmd->getDcFlushEnable()) {
dcFlushFound++;
}
}
EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls);
EXPECT_EQ(1u, dcFlushFound);
}
uint32_t expectedPacketsInUse = 0;
uint32_t expectedKernelCount = 0;
uint32_t expectedWalkerPostSyncOp = 0;
uint32_t expectedPostSyncPipeControls = 0;
bool postSyncAddressZero = false;
};
using AppendFillMultiPacketEventTest = Test<AppendFillMultiPacketEventFixture<0>>;
using AppendFillSinglePacketEventTest = Test<AppendFillMultiPacketEventFixture<1>>;
HWTEST2_F(AppendFillMultiPacketEventTest,
givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling,
IsAtLeastXeHpCore) {
expectedPacketsInUse = 2;
expectedKernelCount = 2;
expectedWalkerPostSyncOp = 3;
postSyncAddressZero = false;
testAppendMemoryFillManyImmediateKernels<gfxCoreFamily>();
}
HWTEST2_F(AppendFillTest,
givenCallToAppendMemoryFillWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, IsAtLeastXeHpCore) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
HWTEST2_F(AppendFillMultiPacketEventTest,
givenCallToAppendMemoryFillWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncProfiling,
IsAtLeastXeHpCore) {
expectedPacketsInUse = 2;
expectedKernelCount = 2;
expectedWalkerPostSyncOp = 3;
postSyncAddressZero = false;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
uint64_t firstKernelEventAddress = event->getGpuAddress(device);
uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize();
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
auto &commandContainer = commandList->commandContainer;
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
EXPECT_EQ(2u, event->getPacketsInUse());
EXPECT_EQ(2u, event->getKernelCount());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(2u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto secondWalker = itorWalkers[1];
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
testAppendMemoryFillManyKernels<gfxCoreFamily>();
}
HWTEST2_F(MultiTileAppendFillTest,
givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfilingAndSingleDcFlushWithPostSync, IsAtLeastXeHpCore) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
HWTEST2_F(AppendFillMultiPacketEventTest,
givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSync,
IsAtLeastXeHpCore) {
expectedPacketsInUse = 1;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 3;
postSyncAddressZero = false;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
testAppendMemoryFillSingleKernel<gfxCoreFamily>();
}
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
HWTEST2_F(AppendFillMultiPacketEventTest,
givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSyncAndL3PostSync,
IsXeHpOrXeHpgCore) {
expectedPacketsInUse = 2;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 3;
expectedPostSyncPipeControls = 1;
postSyncAddressZero = false;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
testAppendMemoryFillSingleKernelAndL3Flush<gfxCoreFamily>();
}
uint64_t firstKernelEventAddress = event->getGpuAddress(device);
uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize();
HWTEST2_F(AppendFillSinglePacketEventTest,
givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfiling,
IsAtLeastXeHpCore) {
expectedPacketsInUse = 1;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 0;
postSyncAddressZero = true;
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
EXPECT_EQ(2u, commandList->partitionCount);
auto &commandContainer = commandList->commandContainer;
testAppendMemoryFillManyImmediateKernels<gfxCoreFamily>();
}
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
HWTEST2_F(AppendFillSinglePacketEventTest,
givenCallToAppendMemoryFillWhenTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfiling,
IsAtLeastXeHpCore) {
expectedPacketsInUse = 1;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 0;
postSyncAddressZero = true;
// two kernels and each kernel uses two packets (for two tiles), in total 4
uint32_t expectedPacketsInUse = 4;
testAppendMemoryFillManyKernels<gfxCoreFamily>();
}
uint32_t expectedDcFlush = 0;
HWTEST2_F(AppendFillSinglePacketEventTest,
givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSync,
IsAtLeastXeHpCore) {
expectedPacketsInUse = 1;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 3;
postSyncAddressZero = false;
testAppendMemoryFillSingleKernel<gfxCoreFamily>();
}
HWTEST2_F(AppendFillSinglePacketEventTest,
givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSyncAndL3PostSync,
IsXeHpOrXeHpgCore) {
expectedPacketsInUse = 2;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 3;
expectedPostSyncPipeControls = 1;
postSyncAddressZero = false;
testAppendMemoryFillSingleKernelAndL3Flush<gfxCoreFamily>();
}
template <int32_t usePipeControlMultiPacketEventSync>
struct MultiTileAppendFillMultiPacketEventFixture : public MultiTileAppendFillFixture {
void setUp() {
DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync);
MultiTileAppendFillFixture::setUp();
}
template <GFXCORE_FAMILY gfxCoreFamily>
void testAppendMemoryFillManyKernels(ze_event_pool_flags_t eventPoolFlags) {
using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
using OPERATION = typename POSTSYNC_DATA::OPERATION;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = eventPoolFlags;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);
uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + 2 * event->getSinglePacketSize();
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
EXPECT_EQ(2u, commandList->partitionCount);
auto &commandContainer = commandList->commandContainer;
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
uint32_t expectedDcFlush = 0;
if (NEO::MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, device->getHwInfo())) {
// 1st dc flush after cross-tile sync, 2nd dc flush for signal scope event
expectedDcFlush = 2;
}
EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
EXPECT_EQ(expectedKernelCount, event->getKernelCount());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(2u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto secondWalker = itorWalkers[1];
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
auto itorPipeControls = findAll<PIPE_CONTROL *>(secondWalker, cmdList.end());
uint32_t postSyncPipeControls = 0;
uint32_t dcFlushFound = 0;
for (auto it : itorPipeControls) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
postSyncPipeControls++;
}
if (cmd->getDcFlushEnable()) {
dcFlushFound++;
}
}
EXPECT_EQ(expectedPostSyncPipeControl, postSyncPipeControls);
EXPECT_EQ(expectedDcFlush, dcFlushFound);
}
template <GFXCORE_FAMILY gfxCoreFamily>
void testAppendMemoryFillSingleKernelAndL3Flush(ze_event_pool_flags_t eventPoolFlags) {
using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = eventPoolFlags;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
int pattern = 0;
const size_t size = 1024;
uint8_t array[size] = {};
auto &commandContainer = commandList->commandContainer;
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(array, &pattern, 1, size, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
EXPECT_EQ(expectedKernelCount, event->getKernelCount());
uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(1u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + 2 * event->getSinglePacketSize();
if (event->isUsingContextEndOffset()) {
l3FlushPostSyncAddress += event->getContextEndOffset();
}
auto itorPipeControls = findAll<PIPE_CONTROL *>(firstWalker, cmdList.end());
uint32_t postSyncPipeControls = 0;
uint32_t dcFlushFound = 0;
for (auto it : itorPipeControls) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
postSyncPipeControls++;
EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
}
if (cmd->getDcFlushEnable()) {
dcFlushFound++;
}
}
constexpr uint32_t expectedDcFlush = 2; // dc flush for last cross-tile sync and separately for signal scope event after last kernel split
EXPECT_EQ(expectedPostSyncPipeControl, postSyncPipeControls);
EXPECT_EQ(expectedDcFlush, dcFlushFound);
}
uint32_t expectedPacketsInUse = 0;
uint32_t expectedKernelCount = 0;
uint32_t expectedWalkerPostSyncOp = 0;
uint32_t expectedPostSyncPipeControl = 0;
bool postSyncAddressZero = false;
};
if (NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, device->getHwInfo())) {
//laster kernel uses 4 packets, in addition to kernel two packets, use 2 packets to two tile cache flush
using MultiTileAppendFillEventMultiPacketTest = Test<MultiTileAppendFillMultiPacketEventFixture<0>>;
using MultiTileAppendFillEventSinglePacketTest = Test<MultiTileAppendFillMultiPacketEventFixture<1>>;
HWTEST2_F(MultiTileAppendFillEventMultiPacketTest,
givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncProfilingAndSingleDcFlushWithImmediatePostSync, IsAtLeastXeHpCore) {
// two kernels and each kernel uses two packets (for two tiles), in total 4
expectedPacketsInUse = 4;
expectedKernelCount = 2;
expectedWalkerPostSyncOp = 3;
expectedPostSyncPipeControl = 0;
if (NEO::MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, device->getHwInfo())) {
// the last kernel uses 4 packets: 2 for the kernel itself plus 2 for the two-tile cache flush
expectedPacketsInUse = 6;
// 1st dc flush after cross-tile sync, 2nd dc flush for the signal scope event
expectedDcFlush = 2;
// cache flush with event signal
expectedPostSyncPipeControl = 1;
}
postSyncAddressZero = false;
testAppendMemoryFillManyKernels<gfxCoreFamily>(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP);
}
EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
EXPECT_EQ(2u, event->getKernelCount());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(2u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto secondWalker = itorWalkers[1];
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
auto itorPipeControls = findAll<PIPE_CONTROL *>(secondWalker, cmdList.end());
uint32_t postSyncPipeControls = 0;
uint32_t dcFlushFound = 0;
for (auto it : itorPipeControls) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
postSyncPipeControls++;
}
if (cmd->getDcFlushEnable()) {
dcFlushFound++;
}
}
EXPECT_EQ(expectedPostSyncPipeControl, postSyncPipeControls);
EXPECT_EQ(expectedDcFlush, dcFlushFound);
}
HWTEST2_F(MultiTileAppendFillEventMultiPacketTest,
givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeImmediateEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncAndSingleDcFlushWithPostSync, IsAtLeastXeHpCore) {
// two kernels and each kernel uses two packets (for two tiles), in total 4
expectedPacketsInUse = 4;
expectedKernelCount = 2;
expectedWalkerPostSyncOp = 3;
expectedPostSyncPipeControl = 0;
if (NEO::MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, device->getHwInfo())) {
// the last kernel uses 4 packets: 2 for the kernel itself plus 2 for the two-tile cache flush
expectedPacketsInUse = 6;
// cache flush with event signal
expectedPostSyncPipeControl = 1;
}
postSyncAddressZero = false;
testAppendMemoryFillManyKernels<gfxCoreFamily>(0);
}
HWTEST2_F(MultiTileAppendFillEventMultiPacketTest,
givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSingleKernelsUsesWalkerPostSyncProfilingAndSingleDcFlushWithImmediatePostSync, IsXeHpOrXeHpgCore) {
// the kernel uses 4 packets: 2 for the kernel itself plus 2 for the two-tile cache flush
expectedPacketsInUse = 4;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 3;
// cache flush with event signal
expectedPostSyncPipeControl = 1;
postSyncAddressZero = false;
testAppendMemoryFillSingleKernelAndL3Flush<gfxCoreFamily>(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP);
}
HWTEST2_F(MultiTileAppendFillEventMultiPacketTest,
givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeImmediateEventUsesComputeWalkerPostSyncThenSingleKernelUsesWalkerPostSyncAndSingleDcFlushWithPostSync, IsXeHpOrXeHpgCore) {
// the kernel uses 4 packets: 2 for the kernel itself plus 2 for the two-tile cache flush
expectedPacketsInUse = 4;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 3;
// cache flush with event signal
expectedPostSyncPipeControl = 1;
testAppendMemoryFillSingleKernelAndL3Flush<gfxCoreFamily>(0);
}
HWTEST2_F(MultiTileAppendFillEventSinglePacketTest,
givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfilingAndDcFlushWithNoPostSync, IsAtLeastXeHpCore) {
expectedPacketsInUse = 2;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 0;
expectedPostSyncPipeControl = 0;
postSyncAddressZero = true;
testAppendMemoryFillManyKernels<gfxCoreFamily>(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP);
}
HWTEST2_F(MultiTileAppendFillEventSinglePacketTest,
givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeImmediateEventUsesPipeControlPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfilingAndDcFlushWithImmediatePostSync, IsAtLeastXeHpCore) {
expectedPacketsInUse = 2;
expectedKernelCount = 1;
expectedWalkerPostSyncOp = 0;
expectedPostSyncPipeControl = 1;
postSyncAddressZero = true;
testAppendMemoryFillManyKernels<gfxCoreFamily>(0);
}
} // namespace ult
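The two fixture instantiations above (template argument 0 and 1) drive the multi-packet and single-packet signaling paths. One plausible way a test or fixture could force the single PIPE_CONTROL path explicitly is the standard NEO debug-flag pattern; the snippet below is a sketch under that assumption, not code taken from this change:

// Hypothetical sketch - assumes the usual DebugManagerStateRestore/flags test pattern.
struct ForcePipeControlEventSync {
    DebugManagerStateRestore restorer; // restores all debug flags when the object goes out of scope
    ForcePipeControlEventSync() {
        NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.set(1); // signal multi-kernel appends with a single PIPE_CONTROL
    }
};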

View File

@@ -629,5 +629,11 @@ HWTEST2_F(L0HwHelperTest, whenAlwaysAllocateEventInLocalMemCalledThenReturnFalse
EXPECT_FALSE(l0HwHelper.alwaysAllocateEventInLocalMem());
}
TEST_F(L0HwHelperTest, givenL0HelperWhenGettingDefaultValueForUsePipeControlMultiKernelEventSyncThenReturnFalse) {
auto hwInfo = *NEO::defaultHwInfo.get();
bool defaultValue = L0::L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo);
EXPECT_FALSE(defaultValue);
}
} // namespace ult
} // namespace L0
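The helper queried above only supplies the per-platform default; the UsePipeControlMultiKernelEventSync debug variable declared further down follows the usual "-1 keeps the default" convention, so consuming code would resolve the effective value along these lines (the function name and placement are assumptions for illustration):

// Hypothetical resolution sketch - only the -1/0/1 semantics are taken from this diff.
bool resolvePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo) {
    bool usePipeControl = L0::L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo); // platform default
    int32_t overrideValue = NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.get();
    if (overrideValue != -1) {
        usePipeControl = (overrideValue != 0); // explicit debug override wins
    }
    return usePipeControl;
}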

View File

@@ -654,7 +654,7 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore,
result = commandList->appendPageFaultCopy(dstAllocation, srcAllocation, size, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
GenCmdList commands;
@@ -1144,7 +1144,8 @@ class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImm
uint64_t srcOffset, uint64_t size,
uint64_t elementSize, Builtin builtin,
Event *signalEvent,
bool isStateless) override {
bool isStateless,
CmdListKernelLaunchParams &launchParams) override {
appendMemoryCopyKernelWithGACalled++;
return ZE_RESULT_SUCCESS;
}

View File

@@ -417,6 +417,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UseDrmCompletionFenceForAllAllocations, -1, "Use
DECLARE_DEBUG_VARIABLE(int32_t, EnableChipsetUniqueUUID, -1, "Enables retrieving chipset unique UUID using telemetry, -1:default (enabled), 0:disable, 1:enable")
DECLARE_DEBUG_VARIABLE(int32_t, EnableFlushTaskSubmission, -1, "Driver uses csr flushTask for immediate commandlist submissions, -1:default (enabled), 0:disabled, 1:enabled")
DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateCmdListHeapSharing, -1, "Immediate command lists using flush task use current csr heap instead private cmd list heap, -1:default (disabled), 0:disabled, 1:enabled")
DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlMultiKernelEventSync, -1, "Use single PIPE_CONTROL for event signal of multi-kernel append operations instead of multi-packet POSTSYNC_DATA from each COMPUTE_WALKER, -1: default, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, EnableBcsSwControlWa, -1, "Enable BCS WA via BCSSWCONTROL MMIO. -1: default, 0: disabled, 1: if src in system mem, 2: if dst in system mem, 3: if src and dst in system mem, 4: always")
/* IMPLICIT SCALING */

View File

@@ -69,6 +69,7 @@ ForceAuxTranslationEnabled = -1
DisableTimestampPacketOptimizations = 0
DisableCachingForStatefulBufferAccess = 0
PrintDebugSettings = 0
UsePipeControlMultiKernelEventSync = -1
PrintDebugMessages = 0
DumpKernels = 0
DumpKernelArgs = 0