Mirror of https://github.com/intel/compute-runtime.git, synced 2025-12-27 15:53:13 +08:00
Use pipe control to signal event of multi kernel operations

Related-To: NEO-7434
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>

Committed by: Compute-Runtime-Automation
Parent: 79386cd7f7
Commit: daa26701e4
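
This change targets append operations that split into several built-in kernels (memory copy and memory fill). When the new L0HwHelper::usePipeControlMultiKernelEventSync() path is enabled and more than one kernel is dispatched, the per-walker post-sync writes are suppressed and the signal event is completed once, by a pipe control emitted after the last walker. A minimal sketch of that decision, reusing names from the diff below and omitting the surrounding driver code:

    // Hedged sketch, not the full implementation: kernelCounter counts the
    // left/middle/right built-in kernels a single appendMemoryCopy() dispatches.
    bool isKernelSplitOperation = kernelCounter > 1;
    bool singlePipeControlPacket = pipeControlMultiKernelEventSync && isKernelSplitOperation;

    appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket);  // before the walkers
    // ... split kernel launches; with singlePipeControlPacket set, each
    // appendLaunchKernelSplit() drops the event so no walker post-sync is programmed ...
    appendEventForProfilingAllWalkers(signalEvent, false, singlePipeControlPacket); // single pipe control signal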
@@ -171,15 +171,22 @@ void CommandList::migrateSharedAllocations() {
    }
}

bool CommandList::setupTimestampEventForMultiTile(Event *signalEvent) {
bool CommandList::isTimestampEventForMultiTile(Event *signalEvent) {
    if (this->partitionCount > 1 &&
        signalEvent) {
        if (signalEvent->isEventTimestampFlagSet()) {
            signalEvent->setPacketsInUse(this->partitionCount);
            return true;
        }
    }
    return false;
}

bool CommandList::setupTimestampEventForMultiTile(Event *signalEvent) {
    if (isTimestampEventForMultiTile(signalEvent)) {
        signalEvent->setPacketsInUse(this->partitionCount);
        return true;
    }
    return false;
}

} // namespace L0
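
The hunk above splits the old helper in two: isTimestampEventForMultiTile() only reports whether a multi-tile timestamp event is being signaled, while setupTimestampEventForMultiTile() additionally marks the event packets in use. A minimal sketch of the intended call pattern, assuming the member names from this hunk:

    // Hedged sketch: configure the event before the walkers, only query it afterwards
    // so the packet count is not bumped twice.
    bool workloadPartition = setupTimestampEventForMultiTile(signalEvent); // pre-walker: sets packets in use
    // ... walkers ...
    workloadPartition = isTimestampEventForMultiTile(signalEvent);         // post-walker: query only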

@@ -308,6 +308,7 @@ struct CommandList : _ze_command_list_handle_t {
    NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
    NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed);
    bool setupTimestampEventForMultiTile(Event *signalEvent);
    bool isTimestampEventForMultiTile(Event *signalEvent);
    bool getDcFlushRequired(bool externalCondition) const {
        return externalCondition ? dcFlushSupport : false;
    }

@@ -43,6 +43,17 @@ struct AlignedAllocationData {
    bool needsFlush = false;
};

struct CmdListFillKernelArguments {
    size_t mainOffset = 0;
    size_t mainGroupSize = 0;
    size_t groups = 0;
    size_t rightOffset = 0;
    size_t patternOffsetRemainder = 0;
    uint32_t leftRemainingBytes = 0;
    uint32_t rightRemainingBytes = 0;
    uint32_t patternSizeInEls = 0;
};

struct EventPool;
struct Event;

@@ -157,7 +168,7 @@ struct CommandListCoreFamily : CommandListImp {
                               uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
    void appendMultiPartitionPrologue(uint32_t partitionDataSize) override;
    void appendMultiPartitionEpilogue() override;
    void appendEventForProfilingAllWalkers(Event *event, bool beforeWalker);
    void appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent);
    ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);

    ze_result_t reserveSpace(size_t size, void **ptr) override;
@@ -173,7 +184,8 @@ struct CommandListCoreFamily : CommandListImp {
                              uint64_t srcOffset, uint64_t size,
                              uint64_t elementSize, Builtin builtin,
                              Event *signalEvent,
                              bool isStateless);
                              bool isStateless,
                              CmdListKernelLaunchParams &launchParams);

    MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyBlit(uintptr_t dstPtr,
                                                      NEO::GraphicsAllocation *dstPtrAlloc,
@@ -270,9 +282,15 @@ struct CommandListCoreFamily : CommandListImp {
    void handlePostSubmissionState();

    virtual void createLogicalStateHelper();
    void setupFillKernelArguments(size_t baseOffset,
                                  size_t patternSize,
                                  size_t dstSize,
                                  CmdListFillKernelArguments &outArguments,
                                  Kernel *kernel);

    size_t cmdListCurrentStartOffset = 0;
    bool containsAnyKernel = false;
    bool pipeControlMultiKernelEventSync = false;
};

template <PRODUCT_FAMILY gfxProductFamily>
@@ -136,6 +136,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::initialize(Device *device, NEO
    this->stateComputeModeTracking = L0HwHelper::enableStateComputeModeTracking(hwInfo);
    this->frontEndStateTracking = L0HwHelper::enableFrontEndStateTracking(hwInfo);
    this->pipelineSelectStateTracking = L0HwHelper::enablePipelineSelectStateTracking(hwInfo);
    this->pipeControlMultiKernelEventSync = L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo);

    if (device->isImplicitScalingCapable() && !this->internalUsage && !isCopyOnly()) {
        this->partitionCount = static_cast<uint32_t>(this->device->getNEODevice()->getDeviceBitfield().count());
@@ -926,7 +927,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(v
                              uint64_t elementSize,
                              Builtin builtin,
                              Event *signalEvent,
                              bool isStateless) {
                              bool isStateless,
                              CmdListKernelLaunchParams &launchParams) {

    auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();

@@ -957,8 +959,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(v
    ze_group_count_t dispatchKernelArgs{groups, 1u, 1u};

    auto dstAllocationType = dstPtrAlloc->getAllocationType();
    CmdListKernelLaunchParams launchParams = {};
    launchParams.isKernelSplitOperation = true;
    launchParams.isBuiltInKernel = true;
    launchParams.isDestinationAllocationInSystemMemory =
        (dstAllocationType == NEO::AllocationType::BUFFER_HOST_MEMORY) ||
@@ -1088,6 +1088,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
                                           srcAddress, srcAllocation, 0u,
                                           size);
    } else {
        CmdListKernelLaunchParams launchParams = {};
        launchParams.isKernelSplitOperation = rightSize > 1;
        ret = appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAddress),
                                           dstAllocation, 0,
                                           reinterpret_cast<void *>(&srcAddress),
@@ -1096,7 +1098,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
                                           middleElSize,
                                           Builtin::CopyBufferToBufferMiddle,
                                           nullptr,
                                           isStateless);
                                           isStateless,
                                           launchParams);
        if (ret == ZE_RESULT_SUCCESS && rightSize) {
            ret = appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAddress),
                                               dstAllocation, size - rightSize,
@@ -1105,7 +1108,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
                                               rightSize, 1UL,
                                               Builtin::CopyBufferToBufferSide,
                                               nullptr,
                                               isStateless);
                                               isStateless,
                                               launchParams);
        }

    if (this->dcFlushSupport) {
@@ -1183,7 +1187,16 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
        signalEvent = Event::fromHandle(hSignalEvent);
    }

    appendEventForProfilingAllWalkers(signalEvent, true);
    uint32_t kernelCounter = leftSize > 0 ? 1 : 0;
    kernelCounter += middleSizeBytes > 0 ? 1 : 0;
    kernelCounter += rightSize > 0 ? 1 : 0;

    CmdListKernelLaunchParams launchParams = {};

    launchParams.isKernelSplitOperation = kernelCounter > 1;
    bool singlePipeControlPacket = this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation;

    appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket);

    if (ret == ZE_RESULT_SUCCESS && leftSize) {
        Builtin copyKernel = Builtin::CopyBufferToBufferSide;
@@ -1203,7 +1216,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
                                               leftSize, 1UL,
                                               copyKernel,
                                               signalEvent,
                                               isStateless);
                                               isStateless,
                                               launchParams);
        }
    }

@@ -1226,7 +1240,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
                                               middleElSize,
                                               copyKernel,
                                               signalEvent,
                                               isStateless);
                                               isStateless,
                                               launchParams);
        }
    }

@@ -1248,11 +1263,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
                                               rightSize, 1UL,
                                               copyKernel,
                                               signalEvent,
                                               isStateless);
                                               isStateless,
                                               launchParams);
        }
    }

    appendEventForProfilingAllWalkers(signalEvent, false);
    appendEventForProfilingAllWalkers(signalEvent, false, singlePipeControlPacket);
    addFlushRequiredCommand(dstAllocationStruct.needsFlush, signalEvent);

    if (NEO::DebugManager.flags.EnableSWTags.get()) {
@@ -1564,86 +1580,70 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
    }
    auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();

    CmdListKernelLaunchParams launchParams = {};
    launchParams.isKernelSplitOperation = true;
    launchParams.isBuiltInKernel = true;
    launchParams.isDestinationAllocationInSystemMemory = hostPointerNeedsFlush;
    Kernel *builtinKernel = nullptr;
    if (patternSize == 1) {
        size_t middleSize = size;
        uint32_t leftRemainder = sizeof(uint32_t) - (dstAllocation.offset % sizeof(uint32_t));
        if (dstAllocation.offset % sizeof(uint32_t) != 0 && leftRemainder <= size) {
            res = appendUnalignedFillKernel(isStateless, leftRemainder, dstAllocation, pattern, signalEvent, launchParams);
            if (res) {
                return res;
            }
            middleSize -= leftRemainder;
            dstAllocation.offset += leftRemainder;
        }
        Kernel *builtinKernel = nullptr;

        if (isStateless) {
            builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateStateless);
        } else {
            builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate);
        }
        const auto dataTypeSize = sizeof(uint32_t) * 4;
        size_t adjustedSize = middleSize / dataTypeSize;
        size_t groupSizeX = device->getDeviceInfo().maxWorkGroupSize;
        if (groupSizeX > adjustedSize && adjustedSize > 0) {
            groupSizeX = adjustedSize;
    } else {
        if (isStateless) {
            builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddleStateless);
        } else {
            builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle);
        }
        if (builtinKernel->setGroupSize(static_cast<uint32_t>(groupSizeX), 1u, 1u)) {
    }

    CmdListKernelLaunchParams launchParams = {};
    launchParams.isBuiltInKernel = true;
    launchParams.isDestinationAllocationInSystemMemory = hostPointerNeedsFlush;

    CmdListFillKernelArguments fillArguments = {};
    setupFillKernelArguments(dstAllocation.offset, patternSize, size, fillArguments, builtinKernel);

    launchParams.isKernelSplitOperation = (fillArguments.leftRemainingBytes > 0 || fillArguments.rightRemainingBytes > 0);
    bool singlePipeControlPacket = this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation;

    appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket);

    if (patternSize == 1) {
        if (fillArguments.leftRemainingBytes > 0) {
            res = appendUnalignedFillKernel(isStateless, fillArguments.leftRemainingBytes, dstAllocation, pattern, signalEvent, launchParams);
            if (res) {
                return res;
            }
        }

        if (builtinKernel->setGroupSize(static_cast<uint32_t>(fillArguments.mainGroupSize), 1u, 1u)) {
            DEBUG_BREAK_IF(true);
            return ZE_RESULT_ERROR_UNKNOWN;
        }

        size_t groups = adjustedSize / groupSizeX;
        uint32_t remainingBytes = static_cast<uint32_t>((adjustedSize % groupSizeX) * dataTypeSize +
                                                        middleSize % dataTypeSize);
        ze_group_count_t dispatchKernelArgs{static_cast<uint32_t>(groups), 1u, 1u};
        ze_group_count_t dispatchKernelArgs{static_cast<uint32_t>(fillArguments.groups), 1u, 1u};

        uint32_t value = 0;
        memset(&value, *reinterpret_cast<const unsigned char *>(pattern), 4);
        builtinKernel->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
        builtinKernel->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
        builtinKernel->setArgumentValue(1, sizeof(fillArguments.mainOffset), &fillArguments.mainOffset);
        builtinKernel->setArgumentValue(2, sizeof(value), &value);

        appendEventForProfilingAllWalkers(signalEvent, true);

        res = appendLaunchKernelSplit(builtinKernel, &dispatchKernelArgs, signalEvent, launchParams);
        if (res) {
            return res;
        }

        if (remainingBytes) {
            dstAllocation.offset += (middleSize - remainingBytes);
            res = appendUnalignedFillKernel(isStateless, remainingBytes, dstAllocation, pattern, signalEvent, launchParams);
        if (fillArguments.rightRemainingBytes > 0) {
            dstAllocation.offset = fillArguments.rightOffset;
            res = appendUnalignedFillKernel(isStateless, fillArguments.rightRemainingBytes, dstAllocation, pattern, signalEvent, launchParams);
            if (res) {
                return res;
            }
        }
    } else {

        Kernel *builtinKernel = nullptr;
        if (isStateless) {
            builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddleStateless);
        } else {
            builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle);
        }
        size_t middleElSize = sizeof(uint32_t);
        size_t adjustedSize = size / middleElSize;
        uint32_t groupSizeX = static_cast<uint32_t>(adjustedSize);
        uint32_t groupSizeY = 1, groupSizeZ = 1;
        builtinKernel->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
        builtinKernel->setGroupSize(groupSizeX, groupSizeY, groupSizeZ);

        uint32_t groups = static_cast<uint32_t>(adjustedSize) / groupSizeX;
        uint32_t remainingBytes = static_cast<uint32_t>((adjustedSize % groupSizeX) * middleElSize +
                                                        size % middleElSize);
        builtinKernel->setGroupSize(static_cast<uint32_t>(fillArguments.mainGroupSize), 1, 1);

        size_t patternAllocationSize = alignUp(patternSize, MemoryConstants::cacheLineSize);
        uint32_t patternSizeInEls = static_cast<uint32_t>(patternAllocationSize / middleElSize);

        auto patternGfxAlloc = device->obtainReusableAllocation(patternAllocationSize, NEO::AllocationType::FILL_PATTERN);
        if (patternGfxAlloc == nullptr) {
            patternGfxAlloc = device->getDriverHandle()->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getNEODevice()->getRootDeviceIndex(),
@@ -1666,22 +1666,21 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,

            patternAllocOffset += patternSizeToCopy;
        } while (patternAllocOffset < patternAllocationSize);

        builtinKernel->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
        builtinKernel->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
        builtinKernel->setArgBufferWithAlloc(2, reinterpret_cast<uintptr_t>(patternGfxAllocPtr), patternGfxAlloc);
        builtinKernel->setArgumentValue(3, sizeof(patternSizeInEls), &patternSizeInEls);
        builtinKernel->setArgumentValue(3, sizeof(fillArguments.patternSizeInEls), &fillArguments.patternSizeInEls);

        appendEventForProfilingAllWalkers(signalEvent, true);

        ze_group_count_t dispatchKernelArgs{groups, 1u, 1u};
        ze_group_count_t dispatchKernelArgs{static_cast<uint32_t>(fillArguments.groups), 1u, 1u};
        res = appendLaunchKernelSplit(builtinKernel, &dispatchKernelArgs, signalEvent, launchParams);
        if (res) {
            return res;
        }

        if (remainingBytes) {
            uint32_t dstOffsetRemainder = groups * groupSizeX * static_cast<uint32_t>(middleElSize);
            uint64_t patternOffsetRemainder = (groupSizeX * groups & (patternSizeInEls - 1)) * middleElSize;
        if (fillArguments.rightRemainingBytes > 0) {
            uint32_t dstOffsetRemainder = static_cast<uint32_t>(fillArguments.rightOffset);
            uint64_t patternOffsetRemainder = fillArguments.patternOffsetRemainder;

            Kernel *builtinKernelRemainder;
            if (isStateless) {
@@ -1690,7 +1689,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
                builtinKernelRemainder = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferRightLeftover);
            }

            builtinKernelRemainder->setGroupSize(remainingBytes, 1u, 1u);
            builtinKernelRemainder->setGroupSize(fillArguments.rightRemainingBytes, 1u, 1u);
            ze_group_count_t dispatchKernelArgs{1u, 1u, 1u};

            builtinKernelRemainder->setArgBufferWithAlloc(0,
@@ -1711,7 +1710,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
        }
    }

    appendEventForProfilingAllWalkers(signalEvent, false);
    appendEventForProfilingAllWalkers(signalEvent, false, singlePipeControlPacket);
    addFlushRequiredCommand(hostPointerNeedsFlush, signalEvent);

    if (NEO::DebugManager.flags.EnableSWTags.get()) {
@@ -2544,4 +2543,57 @@ void CommandListCoreFamily<gfxCoreFamily>::addFlushRequiredCommand(bool flushOpe
    }
}

template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::setupFillKernelArguments(size_t baseOffset,
                                                                    size_t patternSize,
                                                                    size_t dstSize,
                                                                    CmdListFillKernelArguments &outArguments,
                                                                    Kernel *kernel) {
    if (patternSize == 1) {
        size_t middleSize = dstSize;
        outArguments.mainOffset = baseOffset;
        outArguments.leftRemainingBytes = sizeof(uint32_t) - (baseOffset % sizeof(uint32_t));
        if (baseOffset % sizeof(uint32_t) != 0 && outArguments.leftRemainingBytes <= dstSize) {
            middleSize -= outArguments.leftRemainingBytes;
            outArguments.mainOffset += outArguments.leftRemainingBytes;
        } else {
            outArguments.leftRemainingBytes = 0;
        }

        const auto dataTypeSize = sizeof(uint32_t) * 4;
        size_t adjustedSize = middleSize / dataTypeSize;
        outArguments.mainGroupSize = this->device->getDeviceInfo().maxWorkGroupSize;
        if (outArguments.mainGroupSize > adjustedSize && adjustedSize > 0) {
            outArguments.mainGroupSize = adjustedSize;
        }

        outArguments.groups = adjustedSize / outArguments.mainGroupSize;
        outArguments.rightRemainingBytes = static_cast<uint32_t>((adjustedSize % outArguments.mainGroupSize) * dataTypeSize +
                                                                 middleSize % dataTypeSize);

        if (outArguments.rightRemainingBytes > 0) {
            outArguments.rightOffset = outArguments.mainOffset + (middleSize - outArguments.rightRemainingBytes);
        }
    } else {
        size_t middleElSize = sizeof(uint32_t);
        size_t adjustedSize = dstSize / middleElSize;
        uint32_t groupSizeX = static_cast<uint32_t>(adjustedSize);
        uint32_t groupSizeY = 1, groupSizeZ = 1;
        kernel->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
        outArguments.mainGroupSize = groupSizeX;

        outArguments.groups = static_cast<uint32_t>(adjustedSize) / outArguments.mainGroupSize;
        outArguments.rightRemainingBytes = static_cast<uint32_t>((adjustedSize % outArguments.mainGroupSize) * middleElSize +
                                                                 dstSize % middleElSize);

        size_t patternAllocationSize = alignUp(patternSize, MemoryConstants::cacheLineSize);
        outArguments.patternSizeInEls = static_cast<uint32_t>(patternAllocationSize / middleElSize);

        if (outArguments.rightRemainingBytes > 0) {
            outArguments.rightOffset = outArguments.groups * outArguments.mainGroupSize * middleElSize;
            outArguments.patternOffsetRemainder = (outArguments.mainGroupSize * outArguments.groups & (outArguments.patternSizeInEls - 1)) * middleElSize;
        }
    }
}

} // namespace L0
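
As a worked example of the immediate-pattern branch of setupFillKernelArguments() above (patternSize == 1), assume baseOffset = 2, dstSize = 100 and a device maxWorkGroupSize of 1024 (assumed values, not taken from the diff):

    leftRemainingBytes  = 4 - (2 % 4)            = 2    -> mainOffset = 4, middleSize = 98
    adjustedSize        = 98 / 16                = 6
    mainGroupSize       = min(1024, 6)           = 6
    groups              = 6 / 6                  = 1
    rightRemainingBytes = (6 % 6) * 16 + 98 % 16 = 2    -> rightOffset = 4 + (98 - 2) = 100

Both remainders are non-zero here, so appendMemoryFill() treats this as a kernel split operation (left leftover, main fill, right leftover) and, with the flag enabled, signals the event with a single pipe control instead of one post-sync per walker.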

@@ -246,7 +246,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(Kernel
}

template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker) {
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent) {
    if (beforeWalker) {
        appendEventForProfiling(event, true, false);
    } else {

@@ -404,25 +404,31 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(Kernel
                                                                          Event *event,
                                                                          const CmdListKernelLaunchParams &launchParams) {
    if (event) {
        event->increaseKernelCount();
        if (this->pipeControlMultiKernelEventSync && launchParams.isKernelSplitOperation) {
            event = nullptr;
        } else {
            event->increaseKernelCount();
        }
    }
    return appendLaunchKernelWithParams(kernel, threadGroupDimensions, event, launchParams);
}

template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker) {
    if (isCopyOnly()) {
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent) {
    if (isCopyOnly() || singlePacketEvent) {
        if (beforeWalker) {
            appendEventForProfiling(event, true, false);
            bool workloadPartition = setupTimestampEventForMultiTile(event);
            appendEventForProfiling(event, true, workloadPartition);
        } else {
            appendSignalEventPostWalker(event, false);
            bool workloadPartition = isTimestampEventForMultiTile(event);
            appendSignalEventPostWalker(event, workloadPartition);
        }
    } else {
        if (event) {
            if (beforeWalker) {
                event->zeroKernelCount();
            } else {
                if (getDcFlushRequired(!!event->signalScope)) {
                if (event->getKernelCount() > 1 && getDcFlushRequired(!!event->signalScope)) {
                    programEventL3Flush<gfxCoreFamily>(event, this->device, this->partitionCount, this->commandContainer);
                }
            }
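
With the flag enabled, the split launches above receive a nullptr event, so increaseKernelCount() is skipped and no walker post-sync is programmed; the single pipe control emitted by the post-walker path is what signals the event. A minimal sketch of the accounting the unit tests below rely on, assuming a timestamp event used by a split fill:

    // Hedged sketch: expectations when UsePipeControlMultiKernelEventSync is enabled.
    EXPECT_EQ(1u, event->getPacketsInUse()); // one packet, written by the pipe control
    EXPECT_EQ(1u, event->getKernelCount());  // kernel count is not increased per walker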

@@ -62,7 +62,7 @@ struct BcsSplit {
        ze_result_t result = ZE_RESULT_SUCCESS;

        if (hSignalEvent) {
            cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true);
            cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true, true);
        }

        auto markerEventIndex = this->events.obtainForSplit(Context::fromHandle(cmdList->hContext), MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate<gfxCoreFamily>::GfxFamily::TimestampPacketType));
@@ -86,10 +86,10 @@ struct BcsSplit {
        }

        cmdList->addEventsToCmdList(static_cast<uint32_t>(this->cmdQs.size()), eventHandles.data());
        cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], false);
        cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], false, true);

        if (hSignalEvent) {
            cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), false);
            cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), false, true);
        }

        return result;

@@ -46,4 +46,11 @@ bool L0HwHelper::enableImmediateCmdListHeapSharing(const NEO::HardwareInfo &hwIn
    return platformSupport && cmdlistSupport;
}

bool L0HwHelper::usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo) {
    if (NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.get() != -1) {
        return !!NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.get();
    }
    return false;
}

} // namespace L0
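
usePipeControlMultiKernelEventSync() defaults to false and is currently driven only by the UsePipeControlMultiKernelEventSync debug flag (-1 means unset). A minimal sketch of how the tests in this diff opt in, reusing the names shown in the surrounding hunks:

    DebugManagerStateRestore restorer;                             // restores flag state after the test
    DebugManager.flags.UsePipeControlMultiKernelEventSync.set(1);  // 0 keeps the legacy multi-packet path
    bool enabled = L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo); // true once the flag is set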

@@ -34,6 +34,7 @@ class L0HwHelper {
    static bool enablePipelineSelectStateTracking(const NEO::HardwareInfo &hwInfo);
    static bool enableStateComputeModeTracking(const NEO::HardwareInfo &hwInfo);
    static bool enableImmediateCmdListHeapSharing(const NEO::HardwareInfo &hwInfo, bool cmdlistSupport);
    static bool usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo);
    virtual void setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const = 0;
    virtual L0::Event *createEvent(L0::EventPool *eventPool, const ze_event_desc_t *desc, L0::Device *device) const = 0;

@@ -65,6 +65,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
    using BaseClass::initialize;
    using BaseClass::partitionCount;
    using BaseClass::patternAllocations;
    using BaseClass::pipeControlMultiKernelEventSync;
    using BaseClass::pipelineSelectStateTracking;
    using BaseClass::requiredStreamState;
    using BaseClass::stateComputeModeTracking;
@@ -130,6 +131,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
    using BaseClass::immediateCmdListHeapSharing;
    using BaseClass::isFlushTaskSubmissionEnabled;
    using BaseClass::partitionCount;
    using BaseClass::pipeControlMultiKernelEventSync;
    using BaseClass::pipelineSelectStateTracking;
    using BaseClass::requiredStreamState;
    using BaseClass::stateComputeModeTracking;
@@ -143,6 +145,7 @@ struct MockCommandListImmediate : public CommandListCoreFamilyImmediate<gfxCoreF
    using BaseClass::containsAnyKernel;
    using BaseClass::immediateCmdListHeapSharing;
    using BaseClass::indirectAllocationsAllowed;
    using BaseClass::pipeControlMultiKernelEventSync;
    using BaseClass::requiredStreamState;
};

@@ -422,8 +425,9 @@ class MockAppendMemoryCopy : public CommandListCoreFamily<gfxCoreFamily> {
                      uint64_t srcOffset, uint64_t size,
                      uint64_t elementSize, Builtin builtin,
                      Event *signalEvent,
                      bool isStateless),
                     (dstPtr, dstPtrAlloc, dstOffset, srcPtr, srcPtrAlloc, srcOffset, size, elementSize, builtin, signalEvent, isStateless));
                      bool isStateless,
                      CmdListKernelLaunchParams &launchParams),
                     (dstPtr, dstPtrAlloc, dstOffset, srcPtr, srcPtrAlloc, srcOffset, size, elementSize, builtin, signalEvent, isStateless, launchParams));

    ADDMETHOD_NOBASE(appendMemoryCopyBlit, ze_result_t, ZE_RESULT_SUCCESS,
                     (uintptr_t dstPtr,

@@ -50,7 +50,8 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamily<gfxCoreFam
                                          uint64_t elementSize,
                                          Builtin builtin,
                                          Event *signalEvent,
                                          bool isStateless) override {
                                          bool isStateless,
                                          CmdListKernelLaunchParams &launchParams) override {
        appendMemoryCopyKernelWithGACalledTimes++;
        if (isStateless) {
            appendMemoryCopyKernelWithGAStatelessCalledTimes++;

@@ -575,7 +575,7 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyInExternalHostAl

    commandList->appendMemoryCopy(dstPtr, srcPtr, 8, nullptr, 0, nullptr);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
}

@@ -593,7 +593,7 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyInUsmHostAllocat

    commandList->appendMemoryCopy(dstBuffer, srcPtr, 8, nullptr, 0, nullptr);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);

    context->freeMem(dstBuffer);
@@ -617,7 +617,7 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyInUsmDeviceAlloc

    commandList->appendMemoryCopy(dstBuffer, srcPtr, 8, nullptr, 0, nullptr);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);

    context->freeMem(dstBuffer);
@@ -638,7 +638,12 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillInUsmHostThenBui

    commandList->appendMemoryFill(dstBuffer, pattern, patternSize, allocSize, nullptr, 0, nullptr);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);

    commandList->appendMemoryFill(dstBuffer, pattern, 1, allocSize, nullptr, 0, nullptr);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);

    context->freeMem(dstBuffer);
@@ -663,6 +668,43 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillInUsmDeviceThenB

    commandList->appendMemoryFill(dstBuffer, pattern, patternSize, size, nullptr, 0, nullptr);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);

    commandList->appendMemoryFill(dstBuffer, pattern, 1, size, nullptr, 0, nullptr);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);

    context->freeMem(dstBuffer);
}

HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillRequiresMultiKernelsThenSplitFlagIsSet, IsAtLeastSkl) {
    auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
    commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    constexpr size_t patternSize = 8;
    uint8_t pattern[patternSize] = {1, 2, 3, 4};

    constexpr size_t size = 4096u;
    constexpr size_t alignment = 4096u;
    void *dstBuffer = nullptr;

    ze_device_mem_alloc_desc_t deviceDesc = {};
    auto result = context->allocDeviceMem(device->toHandle(),
                                          &deviceDesc,
                                          size, alignment, &dstBuffer);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    constexpr size_t fillSize = size - 1;

    commandList->appendMemoryFill(dstBuffer, pattern, patternSize, fillSize, nullptr, 0, nullptr);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);

    commandList->appendMemoryFill(dstBuffer, pattern, 1, fillSize, nullptr, 0, nullptr);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);

@@ -89,6 +89,8 @@ class AppendFillFixture : public DeviceFixture {
        delete[] dstPtr;
    }

    DebugManagerStateRestore restorer;

    std::unique_ptr<Mock<MockDriverFillHandle>> driverHandle;
    NEO::MockDevice *neoDevice = nullptr;
    L0::Device *device = nullptr;
@@ -108,8 +110,6 @@ struct MultiTileAppendFillFixture : public AppendFillFixture {
        DebugManager.flags.EnableImplicitScaling.set(1);
        AppendFillFixture::setUp();
    }

    DebugManagerStateRestore restorer;
};

using AppendFillTest = Test<AppendFillFixture>;
@@ -480,203 +480,612 @@ HWTEST2_F(AppendFillTest,
                                            false);
}

HWTEST2_F(AppendFillTest,
          givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, IsAtLeastXeHpCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
    using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
template <int32_t usePipeControlMultiPacketEventSync>
struct AppendFillMultiPacketEventFixture : public AppendFillFixture {
    void setUp() {
        DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync);
        AppendFillFixture::setUp();
    }

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    template <GFXCORE_FAMILY gfxCoreFamily>
    void testAppendMemoryFillManyImmediateKernels() {
        using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
        using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
        using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
        using OPERATION = typename POSTSYNC_DATA::OPERATION;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
        ze_event_pool_desc_t eventPoolDesc = {};
        eventPoolDesc.count = 1;
        eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
        ze_event_desc_t eventDesc = {};
        eventDesc.index = 0;

    uint64_t firstKernelEventAddress = event->getGpuAddress(device);
    uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize();
        ze_result_t result = ZE_RESULT_SUCCESS;
        auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));

    auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
    commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    auto &commandContainer = commandList->commandContainer;
        uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);
        uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + event->getSinglePacketSize();

    size_t usedBefore = commandContainer.getCommandStream()->getUsed();
    result = commandList->appendMemoryFill(immediateDstPtr, &immediatePattern,
                                           sizeof(immediatePattern),
                                           immediateAllocSize, event->toHandle(), 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    size_t usedAfter = commandContainer.getCommandStream()->getUsed();
        auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
        commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
        auto &commandContainer = commandList->commandContainer;

    EXPECT_EQ(2u, event->getPacketsInUse());
    EXPECT_EQ(2u, event->getKernelCount());
        size_t usedBefore = commandContainer.getCommandStream()->getUsed();
        result = commandList->appendMemoryFill(immediateDstPtr, &immediatePattern,
                                               sizeof(immediatePattern),
                                               immediateAllocSize, event->toHandle(), 0, nullptr);
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        size_t usedAfter = commandContainer.getCommandStream()->getUsed();

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList,
        ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
        usedAfter - usedBefore));
        EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
        EXPECT_EQ(expectedKernelCount, event->getKernelCount());

    auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(2u, itorWalkers.size());
    auto firstWalker = itorWalkers[0];
    auto secondWalker = itorWalkers[1];
        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
            cmdList,
            ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
            usedAfter - usedBefore));

    auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
        auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
        ASSERT_EQ(2u, itorWalkers.size());
        auto firstWalker = itorWalkers[0];
        auto secondWalker = itorWalkers[1];

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
        auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
        EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
        EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

        walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
        EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
        EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
    }

    template <GFXCORE_FAMILY gfxCoreFamily>
    void testAppendMemoryFillManyKernels() {
        using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
        using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
        using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
        using OPERATION = typename POSTSYNC_DATA::OPERATION;

        ze_event_pool_desc_t eventPoolDesc = {};
        eventPoolDesc.count = 1;
        eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

        ze_event_desc_t eventDesc = {};
        eventDesc.index = 0;

        ze_result_t result = ZE_RESULT_SUCCESS;
        auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));

        uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);
        uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + event->getSinglePacketSize();

        auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
        commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
        auto &commandContainer = commandList->commandContainer;

        size_t usedBefore = commandContainer.getCommandStream()->getUsed();
        result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        size_t usedAfter = commandContainer.getCommandStream()->getUsed();

        EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
        EXPECT_EQ(expectedKernelCount, event->getKernelCount());

        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
            cmdList,
            ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
            usedAfter - usedBefore));

        auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
        ASSERT_EQ(2u, itorWalkers.size());
        auto firstWalker = itorWalkers[0];
        auto secondWalker = itorWalkers[1];

        auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
        EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
        EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

        walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
        EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
        EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
    }

    template <GFXCORE_FAMILY gfxCoreFamily>
    void testAppendMemoryFillSingleKernel() {
        using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
        using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
        using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
        using OPERATION = typename POSTSYNC_DATA::OPERATION;

        ze_event_pool_desc_t eventPoolDesc = {};
        eventPoolDesc.count = 1;
        eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

        ze_event_desc_t eventDesc = {};
        eventDesc.index = 0;

        ze_result_t result = ZE_RESULT_SUCCESS;
        auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));

        auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
        commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

        int pattern = 0;
        const size_t size = 1024;
        uint8_t array[size] = {};

        auto &commandContainer = commandList->commandContainer;
        size_t usedBefore = commandContainer.getCommandStream()->getUsed();
        result = commandList->appendMemoryFill(array, &pattern, 1, size, event->toHandle(), 0, nullptr);
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        size_t usedAfter = commandContainer.getCommandStream()->getUsed();

        EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
        EXPECT_EQ(expectedKernelCount, event->getKernelCount());

        uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);

        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
            cmdList,
            ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
            usedAfter - usedBefore));

        auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
        ASSERT_EQ(1u, itorWalkers.size());
        auto firstWalker = itorWalkers[0];

        auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
        EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
        EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
    }

    template <GFXCORE_FAMILY gfxCoreFamily>
    void testAppendMemoryFillSingleKernelAndL3Flush() {
        using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
        using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
        using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
        using OPERATION = typename POSTSYNC_DATA::OPERATION;
        using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
        using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;

        ze_event_pool_desc_t eventPoolDesc = {};
        eventPoolDesc.count = 1;
        eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

        ze_event_desc_t eventDesc = {};
        eventDesc.index = 0;
        eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;

        ze_result_t result = ZE_RESULT_SUCCESS;
        auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));

        auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
        commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

        int pattern = 0;
        const size_t size = 1024;
        uint8_t array[size] = {};

        auto &commandContainer = commandList->commandContainer;
        size_t usedBefore = commandContainer.getCommandStream()->getUsed();
        result = commandList->appendMemoryFill(array, &pattern, 1, size, event->toHandle(), 0, nullptr);
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        size_t usedAfter = commandContainer.getCommandStream()->getUsed();

        EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
        EXPECT_EQ(expectedKernelCount, event->getKernelCount());

        uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);

        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
            cmdList,
            ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
            usedAfter - usedBefore));

        auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
        ASSERT_EQ(1u, itorWalkers.size());
        auto firstWalker = itorWalkers[0];

        auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
        EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
        EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

        uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + event->getSinglePacketSize();
        if (event->isUsingContextEndOffset()) {
            l3FlushPostSyncAddress += event->getContextEndOffset();
        }

        auto itorPipeControls = findAll<PIPE_CONTROL *>(firstWalker, cmdList.end());

        uint32_t postSyncPipeControls = 0;
        uint32_t dcFlushFound = 0;
        for (auto it : itorPipeControls) {
            auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
            if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
                postSyncPipeControls++;
                EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
                EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
            }
            if (cmd->getDcFlushEnable()) {
                dcFlushFound++;
            }
        }
        EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls);
        EXPECT_EQ(1u, dcFlushFound);
    }

    uint32_t expectedPacketsInUse = 0;
    uint32_t expectedKernelCount = 0;
    uint32_t expectedWalkerPostSyncOp = 0;
    uint32_t expectedPostSyncPipeControls = 0;
    bool postSyncAddressZero = false;
};

using AppendFillMultiPacketEventTest = Test<AppendFillMultiPacketEventFixture<0>>;
using AppendFillSinglePacketEventTest = Test<AppendFillMultiPacketEventFixture<1>>;
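
The two aliases above instantiate the same fixture with the flag disabled (0) and enabled (1); the shared helpers then only differ in the expectations each test sets. Per the assertions replaced in this hunk, expectedWalkerPostSyncOp = 3 stands in for OPERATION_WRITE_TIMESTAMP and 0 for no walker post-sync, while postSyncAddressZero makes the helpers expect a zeroed post-sync address. A minimal usage sketch mirroring the tests that follow:

    // Hedged sketch: single pipe control path, so the walkers carry no post-sync.
    HWTEST2_F(AppendFillSinglePacketEventTest, givenSplitFillThenWalkersCarryNoPostSync, IsAtLeastXeHpCore) {
        expectedPacketsInUse = 1;
        expectedKernelCount = 1;
        expectedWalkerPostSyncOp = 0; // no COMPUTE_WALKER post-sync operation
        postSyncAddressZero = true;
        testAppendMemoryFillManyKernels<gfxCoreFamily>();
    }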
|
||||
|
||||
HWTEST2_F(AppendFillMultiPacketEventTest,
|
||||
givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling,
|
||||
IsAtLeastXeHpCore) {
|
||||
expectedPacketsInUse = 2;
|
||||
expectedKernelCount = 2;
|
||||
expectedWalkerPostSyncOp = 3;
|
||||
postSyncAddressZero = false;
|
||||
|
||||
testAppendMemoryFillManyImmediateKernels<gfxCoreFamily>();
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendFillTest,
|
||||
givenCallToAppendMemoryFillWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, IsAtLeastXeHpCore) {
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
|
||||
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
|
||||
HWTEST2_F(AppendFillMultiPacketEventTest,
|
||||
givenCallToAppendMemoryFillWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncProfiling,
|
||||
IsAtLeastXeHpCore) {
|
||||
expectedPacketsInUse = 2;
|
||||
expectedKernelCount = 2;
|
||||
expectedWalkerPostSyncOp = 3;
|
||||
postSyncAddressZero = false;
|
||||
|
||||
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.count = 1;
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
|
||||
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.index = 0;
|
||||
|
||||
ze_result_t result = ZE_RESULT_SUCCESS;
|
||||
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
|
||||
|
||||
uint64_t firstKernelEventAddress = event->getGpuAddress(device);
|
||||
uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize();
|
||||
|
||||
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
|
||||
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
auto &commandContainer = commandList->commandContainer;
|
||||
|
||||
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
|
||||
result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
|
||||
|
||||
EXPECT_EQ(2u, event->getPacketsInUse());
|
||||
EXPECT_EQ(2u, event->getKernelCount());
|
||||
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
|
||||
cmdList,
|
||||
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
|
||||
usedAfter - usedBefore));
|
||||
|
||||
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
|
||||
ASSERT_EQ(2u, itorWalkers.size());
|
||||
auto firstWalker = itorWalkers[0];
|
||||
auto secondWalker = itorWalkers[1];
|
||||
|
||||
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
|
||||
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
|
||||
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
|
||||
|
||||
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
|
||||
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
|
||||
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
|
||||
testAppendMemoryFillManyKernels<gfxCoreFamily>();
|
||||
}
|
||||
|
||||
HWTEST2_F(MultiTileAppendFillTest,
|
||||
givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfilingAndSingleDcFlushWithPostSync, IsAtLeastXeHpCore) {
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
|
||||
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
|
||||
HWTEST2_F(AppendFillMultiPacketEventTest,
|
||||
givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSync,
|
||||
IsAtLeastXeHpCore) {
|
||||
expectedPacketsInUse = 1;
|
||||
expectedKernelCount = 1;
|
||||
expectedWalkerPostSyncOp = 3;
|
||||
postSyncAddressZero = false;
|
||||
|
||||
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.count = 1;
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
|
||||
testAppendMemoryFillSingleKernel<gfxCoreFamily>();
|
||||
}
|
||||
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.index = 0;
|
||||
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
HWTEST2_F(AppendFillMultiPacketEventTest,
|
||||
givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSyncAndL3PostSync,
|
||||
IsXeHpOrXeHpgCore) {
|
||||
expectedPacketsInUse = 2;
|
||||
expectedKernelCount = 1;
|
||||
expectedWalkerPostSyncOp = 3;
|
||||
expectedPostSyncPipeControls = 1;
|
||||
postSyncAddressZero = false;
|
||||
|
||||
ze_result_t result = ZE_RESULT_SUCCESS;
|
||||
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
|
||||
testAppendMemoryFillSingleKernelAndL3Flush<gfxCoreFamily>();
|
||||
}
|
||||
|
||||
uint64_t firstKernelEventAddress = event->getGpuAddress(device);
|
||||
uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize();
|
||||
HWTEST2_F(AppendFillSinglePacketEventTest,
|
||||
givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfiling,
|
||||
IsAtLeastXeHpCore) {
|
||||
expectedPacketsInUse = 1;
|
||||
expectedKernelCount = 1;
|
||||
expectedWalkerPostSyncOp = 0;
|
||||
postSyncAddressZero = true;
|
||||
|
||||
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
|
||||
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
EXPECT_EQ(2u, commandList->partitionCount);
|
||||
auto &commandContainer = commandList->commandContainer;
|
||||
testAppendMemoryFillManyImmediateKernels<gfxCoreFamily>();
|
||||
}
|
||||
|
||||
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
|
||||
result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
|
||||
HWTEST2_F(AppendFillSinglePacketEventTest,
|
||||
givenCallToAppendMemoryFillWhenTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfiling,
|
||||
IsAtLeastXeHpCore) {
|
||||
expectedPacketsInUse = 1;
|
||||
expectedKernelCount = 1;
|
||||
expectedWalkerPostSyncOp = 0;
|
||||
postSyncAddressZero = true;
|
||||
|
||||
// two kernels and each kernel uses two packets (for two tiles), in total 4
|
||||
uint32_t expectedPacketsInUse = 4;
|
||||
testAppendMemoryFillManyKernels<gfxCoreFamily>();
|
||||
}
|
||||
|
||||
uint32_t expectedDcFlush = 0;
|
||||
HWTEST2_F(AppendFillSinglePacketEventTest,
|
||||
givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSync,
|
||||
IsAtLeastXeHpCore) {
|
||||
expectedPacketsInUse = 1;
|
||||
expectedKernelCount = 1;
|
||||
expectedWalkerPostSyncOp = 3;
|
||||
postSyncAddressZero = false;
|
||||
|
||||
testAppendMemoryFillSingleKernel<gfxCoreFamily>();
|
||||
}

HWTEST2_F(AppendFillSinglePacketEventTest,
          givenAppendMemoryFillUsingSinglePacketEventWhenPatternDispatchOneKernelThenUseComputeWalkerPostSyncAndL3PostSync,
          IsXeHpOrXeHpgCore) {
    expectedPacketsInUse = 2;
    expectedKernelCount = 1;
    expectedWalkerPostSyncOp = 3;
    expectedPostSyncPipeControls = 1;
    postSyncAddressZero = false;

    testAppendMemoryFillSingleKernelAndL3Flush<gfxCoreFamily>();
}
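
// Illustrative only (not taken from this diff): expectedWalkerPostSyncOp in these tests is compared
// against the walker post-sync operation after a static_cast to POSTSYNC_DATA::OPERATION, so 0 means
// the COMPUTE_WALKER writes no post-sync data (the event is then signaled on the PIPE_CONTROL path)
// and 3 is the timestamp write, matching the OPERATION_WRITE_TIMESTAMP assertions used elsewhere in
// this file. A hypothetical helper that makes the cast explicit:
template <typename FamilyType>
typename FamilyType::POSTSYNC_DATA::OPERATION toWalkerPostSyncOp(uint32_t rawOp) {
    return static_cast<typename FamilyType::POSTSYNC_DATA::OPERATION>(rawOp);
}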

template <int32_t usePipeControlMultiPacketEventSync>
struct MultiTileAppendFillMultiPacketEventFixture : public MultiTileAppendFillFixture {
    void setUp() {
        DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync);
        MultiTileAppendFillFixture::setUp();
    }

    template <GFXCORE_FAMILY gfxCoreFamily>
    void testAppendMemoryFillManyKernels(ze_event_pool_flags_t eventPoolFlags) {
        using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
        using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
        using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
        using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
        using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
        using OPERATION = typename POSTSYNC_DATA::OPERATION;

        ze_event_pool_desc_t eventPoolDesc = {};
        eventPoolDesc.count = 1;
        eventPoolDesc.flags = eventPoolFlags;

        ze_event_desc_t eventDesc = {};
        eventDesc.index = 0;
        eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;

        ze_result_t result = ZE_RESULT_SUCCESS;
        auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));

        uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);
        uint64_t secondKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device) + 2 * event->getSinglePacketSize();

        auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
        commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
        EXPECT_EQ(2u, commandList->partitionCount);
        auto &commandContainer = commandList->commandContainer;

        size_t usedBefore = commandContainer.getCommandStream()->getUsed();
        result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        size_t usedAfter = commandContainer.getCommandStream()->getUsed();

        uint32_t expectedDcFlush = 0;

        if (NEO::MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, device->getHwInfo())) {
            // 1st dc flush after cross-tile sync, 2nd dc flush for signal scope event
            expectedDcFlush = 2;
        }

        EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
        EXPECT_EQ(expectedKernelCount, event->getKernelCount());

        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
            cmdList,
            ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
            usedAfter - usedBefore));

        auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
        ASSERT_EQ(2u, itorWalkers.size());
        auto firstWalker = itorWalkers[0];
        auto secondWalker = itorWalkers[1];

        auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
        EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
        EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

        walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
        EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
        EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

        auto itorPipeControls = findAll<PIPE_CONTROL *>(secondWalker, cmdList.end());

        uint32_t postSyncPipeControls = 0;
        uint32_t dcFlushFound = 0;

        for (auto it : itorPipeControls) {
            auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
            if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
                EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
                postSyncPipeControls++;
            }
            if (cmd->getDcFlushEnable()) {
                dcFlushFound++;
            }
        }

        EXPECT_EQ(expectedPostSyncPipeControl, postSyncPipeControls);
        EXPECT_EQ(expectedDcFlush, dcFlushFound);
    }

    template <GFXCORE_FAMILY gfxCoreFamily>
    void testAppendMemoryFillSingleKernelAndL3Flush(ze_event_pool_flags_t eventPoolFlags) {
        using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
        using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
        using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
        using OPERATION = typename POSTSYNC_DATA::OPERATION;
        using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
        using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;

        ze_event_pool_desc_t eventPoolDesc = {};
        eventPoolDesc.count = 1;
        eventPoolDesc.flags = eventPoolFlags;

        ze_event_desc_t eventDesc = {};
        eventDesc.index = 0;
        eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;

        ze_result_t result = ZE_RESULT_SUCCESS;
        auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));

        auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
        commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

        int pattern = 0;
        const size_t size = 1024;
        uint8_t array[size] = {};

        auto &commandContainer = commandList->commandContainer;
        size_t usedBefore = commandContainer.getCommandStream()->getUsed();
        result = commandList->appendMemoryFill(array, &pattern, 1, size, event->toHandle(), 0, nullptr);
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
        size_t usedAfter = commandContainer.getCommandStream()->getUsed();

        EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
        EXPECT_EQ(expectedKernelCount, event->getKernelCount());

        uint64_t firstKernelEventAddress = postSyncAddressZero ? 0 : event->getGpuAddress(device);

        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
            cmdList,
            ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
            usedAfter - usedBefore));

        auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
        ASSERT_EQ(1u, itorWalkers.size());
        auto firstWalker = itorWalkers[0];

        auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
        EXPECT_EQ(static_cast<OPERATION>(expectedWalkerPostSyncOp), walkerCmd->getPostSync().getOperation());
        EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

        uint64_t l3FlushPostSyncAddress = firstKernelEventAddress + 2 * event->getSinglePacketSize();
        if (event->isUsingContextEndOffset()) {
            l3FlushPostSyncAddress += event->getContextEndOffset();
        }

        auto itorPipeControls = findAll<PIPE_CONTROL *>(firstWalker, cmdList.end());

        uint32_t postSyncPipeControls = 0;
        uint32_t dcFlushFound = 0;
        for (auto it : itorPipeControls) {
            auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
            if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
                postSyncPipeControls++;
                EXPECT_EQ(l3FlushPostSyncAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
                EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
            }
            if (cmd->getDcFlushEnable()) {
                dcFlushFound++;
            }
        }

        constexpr uint32_t expectedDcFlush = 2; // dc flush for last cross-tile sync and separately for signal scope event after last kernel split
        EXPECT_EQ(expectedPostSyncPipeControl, postSyncPipeControls);
        EXPECT_EQ(expectedDcFlush, dcFlushFound);
    }

    uint32_t expectedPacketsInUse = 0;
    uint32_t expectedKernelCount = 0;
    uint32_t expectedWalkerPostSyncOp = 0;
    uint32_t expectedPostSyncPipeControl = 0;
    bool postSyncAddressZero = false;
};

using MultiTileAppendFillEventMultiPacketTest = Test<MultiTileAppendFillMultiPacketEventFixture<0>>;
using MultiTileAppendFillEventSinglePacketTest = Test<MultiTileAppendFillMultiPacketEventFixture<1>>;

HWTEST2_F(MultiTileAppendFillEventMultiPacketTest,
          givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncProfilingAndSingleDcFlushWithImmediatePostSync, IsAtLeastXeHpCore) {
    // two kernels and each kernel uses two packets (for two tiles), in total 4
    expectedPacketsInUse = 4;
    expectedKernelCount = 2;
    expectedWalkerPostSyncOp = 3;
    expectedPostSyncPipeControl = 0;
    if (NEO::MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, device->getHwInfo())) {
        // last kernel uses 4 packets; in addition to the kernel's two packets, 2 packets are used for the two-tile cache flush
        expectedPacketsInUse = 6;
        // 1st dc flush after cross-tile sync, 2nd dc flush for signal scope event
        expectedDcFlush = 2;
        // cache flush with event signal
        expectedPostSyncPipeControl = 1;
    }
    postSyncAddressZero = false;
    testAppendMemoryFillManyKernels<gfxCoreFamily>(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP);
}
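
// Illustrative only (the identifiers below are hypothetical; the arithmetic restates the comments above):
// each kernel signals one post-sync packet per tile, and the signal-scope cache flush adds one more
// packet per tile when a DC flush is required, which is where 2 * 2 + 2 = 6 comes from.
constexpr uint32_t tileCount = 2;
constexpr uint32_t splitKernelCount = 2;
constexpr uint32_t walkerPackets = splitKernelCount * tileCount; // 4 packets from the two COMPUTE_WALKERs
constexpr uint32_t cacheFlushPackets = tileCount;                // +2 packets when getDcFlushEnable() is true
static_assert(walkerPackets + cacheFlushPackets == 6, "matches expectedPacketsInUse in the DC-flush path");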

    EXPECT_EQ(expectedPacketsInUse, event->getPacketsInUse());
    EXPECT_EQ(2u, event->getKernelCount());

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList,
        ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
        usedAfter - usedBefore));

    auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(2u, itorWalkers.size());
    auto firstWalker = itorWalkers[0];
    auto secondWalker = itorWalkers[1];

    auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    auto itorPipeControls = findAll<PIPE_CONTROL *>(secondWalker, cmdList.end());

    uint32_t postSyncPipeControls = 0;
    uint32_t dcFlushFound = 0;

    for (auto it : itorPipeControls) {
        auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
        if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
            EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
            postSyncPipeControls++;
        }
        if (cmd->getDcFlushEnable()) {
            dcFlushFound++;
        }

HWTEST2_F(MultiTileAppendFillEventMultiPacketTest,
          givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeImmediateEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesWalkerPostSyncAndSingleDcFlushWithPostSync, IsAtLeastXeHpCore) {
    // two kernels and each kernel uses two packets (for two tiles), in total 4
    expectedPacketsInUse = 4;
    expectedKernelCount = 2;
    expectedWalkerPostSyncOp = 3;
    expectedPostSyncPipeControl = 0;
    if (NEO::MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, device->getHwInfo())) {
        // last kernel uses 4 packets; in addition to the kernel's two packets, 2 packets are used for the two-tile cache flush
        expectedPacketsInUse = 6;
        // cache flush with event signal
        expectedPostSyncPipeControl = 1;
    }
    postSyncAddressZero = false;
    testAppendMemoryFillManyKernels<gfxCoreFamily>(0);
}

    EXPECT_EQ(expectedPostSyncPipeControl, postSyncPipeControls);
    EXPECT_EQ(expectedDcFlush, dcFlushFound);

HWTEST2_F(MultiTileAppendFillEventMultiPacketTest,
          givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSingleKernelsUsesWalkerPostSyncProfilingAndSingleDcFlushWithImmediatePostSync, IsXeHpOrXeHpgCore) {
    // kernel uses 4 packets; in addition to the kernel's two packets, 2 packets are used for the two-tile cache flush
    expectedPacketsInUse = 4;
    expectedKernelCount = 1;
    expectedWalkerPostSyncOp = 3;
    // cache flush with event signal
    expectedPostSyncPipeControl = 1;
    postSyncAddressZero = false;

    testAppendMemoryFillSingleKernelAndL3Flush<gfxCoreFamily>(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP);
}

HWTEST2_F(MultiTileAppendFillEventMultiPacketTest,
          givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeImmediateEventUsesComputeWalkerPostSyncThenSingleKernelUsesWalkerPostSyncAndSingleDcFlushWithPostSync, IsXeHpOrXeHpgCore) {
    // kernel uses 4 packets; in addition to the kernel's two packets, 2 packets are used for the two-tile cache flush
    expectedPacketsInUse = 4;
    expectedKernelCount = 1;
    expectedWalkerPostSyncOp = 3;
    // cache flush with event signal
    expectedPostSyncPipeControl = 1;

    testAppendMemoryFillSingleKernelAndL3Flush<gfxCoreFamily>(0);
}

HWTEST2_F(MultiTileAppendFillEventSinglePacketTest,
          givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesRegisterPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfilingAndDcFlushWithNoPostSync, IsAtLeastXeHpCore) {
    expectedPacketsInUse = 2;
    expectedKernelCount = 1;
    expectedWalkerPostSyncOp = 0;
    expectedPostSyncPipeControl = 0;
    postSyncAddressZero = true;
    testAppendMemoryFillManyKernels<gfxCoreFamily>(ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP);
}

HWTEST2_F(MultiTileAppendFillEventSinglePacketTest,
          givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeImmediateEventUsesPipeControlPostSyncThenSeparateKernelsNotUsesWalkerPostSyncProfilingAndDcFlushWithImmediatePostSync, IsAtLeastXeHpCore) {
    expectedPacketsInUse = 2;
    expectedKernelCount = 1;
    expectedWalkerPostSyncOp = 0;
    expectedPostSyncPipeControl = 1;
    postSyncAddressZero = true;
    testAppendMemoryFillManyKernels<gfxCoreFamily>(0);
}

} // namespace ult

File diff suppressed because it is too large
@@ -629,5 +629,11 @@ HWTEST2_F(L0HwHelperTest, whenAlwaysAllocateEventInLocalMemCalledThenReturnFalse
    EXPECT_FALSE(l0HwHelper.alwaysAllocateEventInLocalMem());
}

TEST_F(L0HwHelperTest, givenL0HelperWhenGettingDefaultValueForUsePipeControlMultiKernelEventSyncThenReturnFalse) {
    auto hwInfo = *NEO::defaultHwInfo.get();
    bool defaultValue = L0::L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo);
    EXPECT_FALSE(defaultValue);
}

} // namespace ult
} // namespace L0
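
// Sketch of the usual NEO wiring for a key like this (assumed pattern, not taken from this diff;
// hwInfo is as in the test above): the -1 default defers to the per-platform L0HwHelper answer,
// while 0/1 force the legacy multi-packet path or the new single PIPE_CONTROL signal path.
bool usePipeControlSignal = L0::L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo);
if (NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.get() != -1) {
    usePipeControlSignal = !!NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.get();
}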

@@ -654,7 +654,7 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore,
    result = commandList->appendPageFaultCopy(dstAllocation, srcAllocation, size, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_FALSE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
    EXPECT_TRUE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);

    GenCmdList commands;

@@ -1144,7 +1144,8 @@ class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImm
                                          uint64_t srcOffset, uint64_t size,
                                          uint64_t elementSize, Builtin builtin,
                                          Event *signalEvent,
                                          bool isStateless) override {
                                          bool isStateless,
                                          CmdListKernelLaunchParams &launchParams) override {
        appendMemoryCopyKernelWithGACalled++;
        return ZE_RESULT_SUCCESS;
    }
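
// Illustrative only: the widened override mirrors the production signature, which now threads the
// launch parameters into the copy-kernel path; the fields shown are the ones the tests above check
// through usedKernelLaunchParams.
CmdListKernelLaunchParams launchParams = {};
launchParams.isBuiltInKernel = true;
launchParams.isKernelSplitOperation = true;
launchParams.isDestinationAllocationInSystemMemory = true;
// ...then forwarded as the new trailing argument of appendMemoryCopyKernelWithGA(...).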

@@ -417,6 +417,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UseDrmCompletionFenceForAllAllocations, -1, "Use
DECLARE_DEBUG_VARIABLE(int32_t, EnableChipsetUniqueUUID, -1, "Enables retrieving chipset unique UUID using telemetry, -1:default (enabled), 0:disable, 1:enable")
DECLARE_DEBUG_VARIABLE(int32_t, EnableFlushTaskSubmission, -1, "Driver uses csr flushTask for immediate commandlist submissions, -1:default (enabled), 0:disabled, 1:enabled")
DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateCmdListHeapSharing, -1, "Immediate command lists using flush task use current csr heap instead private cmd list heap, -1:default (disabled), 0:disabled, 1:enabled")
DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlMultiKernelEventSync, -1, "Use single PIPE_CONTROL for event signal of multi-kernel append operations instead of multi-packet POSTSYNC_DATA from each COMPUTE_WALKER, -1: default, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, EnableBcsSwControlWa, -1, "Enable BCS WA via BCSSWCONTROL MMIO. -1: default, 0: disabled, 1: if src in system mem, 2: if dst in system mem, 3: if src and dst in system mem, 4: always")

/* IMPLICIT SCALING */

@@ -69,6 +69,7 @@ ForceAuxTranslationEnabled = -1
DisableTimestampPacketOptimizations = 0
DisableCachingForStatefulBufferAccess = 0
PrintDebugSettings = 0
UsePipeControlMultiKernelEventSync = -1
PrintDebugMessages = 0
DumpKernels = 0
DumpKernelArgs = 0
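
For reference, forcing the new single PIPE_CONTROL signaling path locally would be a one-line override of this default, following the 0/1 semantics documented for the debug variable above (illustrative):

UsePipeControlMultiKernelEventSync = 1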