Use single event for multiple kernels

Related-To: NEO-6871

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2022-04-08 18:48:45 +00:00
committed by Compute-Runtime-Automation
parent a3745c28a3
commit 819d648997
25 changed files with 1074 additions and 338 deletions

View File

@@ -181,4 +181,16 @@ void CommandList::handleIndirectAllocationResidency() {
} }
} }
// Prepares a timestamp signal event for a command list split across partitions.
// Returns true only when the list has more than one partition, a signal event
// was supplied and that event has its timestamp flag set; in that case the
// event's packets-in-use value is set to the partition count. In every other
// case the event is left untouched and false is returned.
bool CommandList::setupTimestampEventForMultiTile(ze_event_handle_t signalEvent) {
    // Nothing to set up for a single partition or when no event was passed.
    if (!(this->partitionCount > 1) || !signalEvent) {
        return false;
    }

    auto timestampCandidate = Event::fromHandle(signalEvent);
    if (!timestampCandidate->isEventTimestampFlagSet()) {
        return false;
    }

    // One packet per partition.
    timestampCandidate->setPacketsInUse(this->partitionCount);
    return true;
}
} // namespace L0 } // namespace L0

View File

@@ -264,6 +264,7 @@ struct CommandList : _ze_command_list_handle_t {
protected: protected:
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize); NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed); NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed);
bool setupTimestampEventForMultiTile(ze_event_handle_t signalEvent);
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap; std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations; std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;

View File

@@ -230,7 +230,7 @@ struct CommandListCoreFamily : CommandListImp {
ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]); ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]);
ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t *pThreadGroupDimensions); ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t *pThreadGroupDimensions);
void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb, bool workloadPartition); void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb, bool workloadPartition);
void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask); void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask, bool workloadPartition);
void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker, bool workloadPartition); void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker, bool workloadPartition);
void appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker); void appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker);
void appendEventForProfilingCopyCommand(ze_event_handle_t hEvent, bool beforeWalker); void appendEventForProfilingCopyCommand(ze_event_handle_t hEvent, bool beforeWalker);

View File

@@ -364,9 +364,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryRangesBarrier(uint
return ret; return ret;
} }
appendEventForProfiling(hSignalEvent, true, false); bool workloadPartition = setupTimestampEventForMultiTile(hSignalEvent);
appendEventForProfiling(hSignalEvent, true, workloadPartition);
applyMemoryRangesBarrier(numRanges, pRangeSizes, pRanges); applyMemoryRangesBarrier(numRanges, pRangeSizes, pRanges);
appendSignalEventPostWalker(hSignalEvent, false); appendSignalEventPostWalker(hSignalEvent, workloadPartition);
if (this->cmdListType == CommandListType::TYPE_IMMEDIATE) { if (this->cmdListType == CommandListType::TYPE_IMMEDIATE) {
executeCommandListImmediate(true); executeCommandListImmediate(true);
@@ -800,22 +802,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemAdvise(ze_device_hand
return ZE_RESULT_ERROR_INVALID_ARGUMENT; return ZE_RESULT_ERROR_INVALID_ARGUMENT;
} }
// Launches one kernel of a split operation.
// This variant does not forward hEvent: a null event handle is passed to
// appendLaunchKernelWithParams, whose result is returned unchanged.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(ze_kernel_handle_t hKernel,
                                                                          const ze_group_count_t *pThreadGroupDimensions,
                                                                          ze_event_handle_t hEvent) {
    const ze_result_t launchResult = appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr,
                                                                  false, false, false);
    return launchResult;
}
// Brackets a group of walkers with event commands.
// beforeWalker == true  -> appendEventForProfiling(hEvent, true, false)
// beforeWalker == false -> appendSignalEventPostWalker(hEvent, false)
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) {
    if (!beforeWalker) {
        appendSignalEventPostWalker(hEvent, false);
        return;
    }
    appendEventForProfiling(hEvent, true, false);
}
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(void *dstPtr, ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(void *dstPtr,
NEO::GraphicsAllocation *dstPtrAlloc, NEO::GraphicsAllocation *dstPtrAlloc,
@@ -1075,18 +1061,21 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
if (isStateless) { if (isStateless) {
func = Builtin::CopyBufferToBufferSideStateless; func = Builtin::CopyBufferToBufferSideStateless;
} }
ret = isCopyOnly() ? appendMemoryCopyBlit(dstAllocationStruct.alignedAllocationPtr, if (isCopyOnly()) {
dstAllocationStruct.alloc, dstAllocationStruct.offset, ret = appendMemoryCopyBlit(dstAllocationStruct.alignedAllocationPtr,
srcAllocationStruct.alignedAllocationPtr, dstAllocationStruct.alloc, dstAllocationStruct.offset,
srcAllocationStruct.alloc, srcAllocationStruct.offset, leftSize) srcAllocationStruct.alignedAllocationPtr,
: appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAllocationStruct.alignedAllocationPtr), srcAllocationStruct.alloc, srcAllocationStruct.offset, leftSize);
dstAllocationStruct.alloc, dstAllocationStruct.offset, } else {
reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr), ret = appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAllocationStruct.alignedAllocationPtr),
srcAllocationStruct.alloc, srcAllocationStruct.offset, dstAllocationStruct.alloc, dstAllocationStruct.offset,
leftSize, 1UL, reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr),
func, srcAllocationStruct.alloc, srcAllocationStruct.offset,
hSignalEvent, leftSize, 1UL,
isStateless); func,
hSignalEvent,
isStateless);
}
} }
if (ret == ZE_RESULT_SUCCESS && middleSizeBytes) { if (ret == ZE_RESULT_SUCCESS && middleSizeBytes) {
@@ -1094,19 +1083,22 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
if (isStateless) { if (isStateless) {
func = Builtin::CopyBufferToBufferMiddleStateless; func = Builtin::CopyBufferToBufferMiddleStateless;
} }
ret = isCopyOnly() ? appendMemoryCopyBlit(dstAllocationStruct.alignedAllocationPtr, if (isCopyOnly()) {
dstAllocationStruct.alloc, leftSize + dstAllocationStruct.offset, ret = appendMemoryCopyBlit(dstAllocationStruct.alignedAllocationPtr,
srcAllocationStruct.alignedAllocationPtr, dstAllocationStruct.alloc, leftSize + dstAllocationStruct.offset,
srcAllocationStruct.alloc, leftSize + srcAllocationStruct.offset, middleSizeBytes) srcAllocationStruct.alignedAllocationPtr,
: appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAllocationStruct.alignedAllocationPtr), srcAllocationStruct.alloc, leftSize + srcAllocationStruct.offset, middleSizeBytes);
dstAllocationStruct.alloc, leftSize + dstAllocationStruct.offset, } else {
reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr), ret = appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAllocationStruct.alignedAllocationPtr),
srcAllocationStruct.alloc, leftSize + srcAllocationStruct.offset, dstAllocationStruct.alloc, leftSize + dstAllocationStruct.offset,
middleSizeBytes, reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr),
middleElSize, srcAllocationStruct.alloc, leftSize + srcAllocationStruct.offset,
func, middleSizeBytes,
hSignalEvent, middleElSize,
isStateless); func,
hSignalEvent,
isStateless);
}
} }
if (ret == ZE_RESULT_SUCCESS && rightSize) { if (ret == ZE_RESULT_SUCCESS && rightSize) {
@@ -1114,18 +1106,21 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
if (isStateless) { if (isStateless) {
func = Builtin::CopyBufferToBufferSideStateless; func = Builtin::CopyBufferToBufferSideStateless;
} }
ret = isCopyOnly() ? appendMemoryCopyBlit(dstAllocationStruct.alignedAllocationPtr, if (isCopyOnly()) {
dstAllocationStruct.alloc, leftSize + middleSizeBytes + dstAllocationStruct.offset, ret = appendMemoryCopyBlit(dstAllocationStruct.alignedAllocationPtr,
srcAllocationStruct.alignedAllocationPtr, dstAllocationStruct.alloc, leftSize + middleSizeBytes + dstAllocationStruct.offset,
srcAllocationStruct.alloc, leftSize + middleSizeBytes + srcAllocationStruct.offset, rightSize) srcAllocationStruct.alignedAllocationPtr,
: appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAllocationStruct.alignedAllocationPtr), srcAllocationStruct.alloc, leftSize + middleSizeBytes + srcAllocationStruct.offset, rightSize);
dstAllocationStruct.alloc, leftSize + middleSizeBytes + dstAllocationStruct.offset, } else {
reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr), ret = appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAllocationStruct.alignedAllocationPtr),
srcAllocationStruct.alloc, leftSize + middleSizeBytes + srcAllocationStruct.offset, dstAllocationStruct.alloc, leftSize + middleSizeBytes + dstAllocationStruct.offset,
rightSize, 1UL, reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr),
func, srcAllocationStruct.alloc, leftSize + middleSizeBytes + srcAllocationStruct.offset,
hSignalEvent, rightSize, 1UL,
isStateless); func,
hSignalEvent,
isStateless);
}
} }
appendEventForProfilingAllWalkers(hSignalEvent, false); appendEventForProfilingAllWalkers(hSignalEvent, false);
@@ -1557,6 +1552,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
reinterpret_cast<uintptr_t>(patternGfxAllocPtr) + patternOffsetRemainder, reinterpret_cast<uintptr_t>(patternGfxAllocPtr) + patternOffsetRemainder,
patternGfxAlloc); patternGfxAlloc);
builtinFunctionRemainder->setArgumentValue(3, sizeof(patternAllocationSize), &patternAllocationSize); builtinFunctionRemainder->setArgumentValue(3, sizeof(patternAllocationSize), &patternAllocationSize);
res = appendLaunchKernelSplit(builtinFunctionRemainder->toHandle(), &dispatchFuncArgs, hSignalEvent); res = appendLaunchKernelSplit(builtinFunctionRemainder->toHandle(), &dispatchFuncArgs, hSignalEvent);
if (res) { if (res) {
return res; return res;
@@ -1951,7 +1947,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(ze_event_h
constexpr uint32_t mask = 0xfffffffe; constexpr uint32_t mask = 0xfffffffe;
auto event = Event::fromHandle(hEvent); auto event = Event::fromHandle(hEvent);
auto baseAddr = event->getGpuAddress(this->device); auto baseAddr = event->getPacketAddress(this->device);
auto contextOffset = beforeWalker ? event->getContextStartOffset() : event->getContextEndOffset(); auto contextOffset = beforeWalker ? event->getContextStartOffset() : event->getContextEndOffset();
auto globalOffset = beforeWalker ? event->getGlobalStartOffset() : event->getGlobalEndOffset(); auto globalOffset = beforeWalker ? event->getGlobalStartOffset() : event->getGlobalEndOffset();
@@ -1966,7 +1962,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(ze_event_h
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextAddress, workloadPartition); NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextAddress, workloadPartition);
} }
adjustWriteKernelTimestamp(globalAddress, contextAddress, maskLsb, mask); adjustWriteKernelTimestamp(globalAddress, contextAddress, maskLsb, mask, workloadPartition);
} }
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
@@ -2018,6 +2014,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteGlobalTimestamp(
} }
} }
bool workloadPartition = setupTimestampEventForMultiTile(hSignalEvent);
appendEventForProfiling(hSignalEvent, true, workloadPartition);
const auto &hwInfo = this->device->getHwInfo(); const auto &hwInfo = this->device->getHwInfo();
if (isCopyOnly()) { if (isCopyOnly()) {
NEO::MiFlushArgs args; NEO::MiFlushArgs args;
@@ -2031,17 +2030,16 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteGlobalTimestamp(
} else { } else {
NEO::PipeControlArgs args; NEO::PipeControlArgs args;
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlWithPostSync( NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
*commandContainer.getCommandStream(), *commandContainer.getCommandStream(),
POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_TIMESTAMP, POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_TIMESTAMP,
reinterpret_cast<uint64_t>(dstptr), reinterpret_cast<uint64_t>(dstptr),
0, 0,
hwInfo,
args); args);
} }
if (hSignalEvent) { appendSignalEventPostWalker(hSignalEvent, workloadPartition);
CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(hSignalEvent, false);
}
auto allocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(uint64_t), false); auto allocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(uint64_t), false);
commandContainer.addToResidencyContainer(allocationStruct.alloc); commandContainer.addToResidencyContainer(allocationStruct.alloc);
@@ -2263,7 +2261,7 @@ void CommandListCoreFamily<gfxCoreFamily>::programStateBaseAddress(NEO::CommandC
} }
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask) {} void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask, bool workloadPartition) {}
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_t hSignalEvent, ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_t hSignalEvent,
@@ -2274,15 +2272,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
if (ret) { if (ret) {
return ret; return ret;
} }
bool workloadPartition = false; bool workloadPartition = setupTimestampEventForMultiTile(hSignalEvent);
if (this->partitionCount > 1 &&
hSignalEvent) {
auto event = Event::fromHandle(hSignalEvent);
if (event->isEventTimestampFlagSet()) {
event->setPacketsInUse(this->partitionCount);
workloadPartition = true;
}
}
appendEventForProfiling(hSignalEvent, true, workloadPartition); appendEventForProfiling(hSignalEvent, true, workloadPartition);
if (isCopyOnly()) { if (isCopyOnly()) {

View File

@@ -201,4 +201,20 @@ inline size_t CommandListCoreFamily<gfxCoreFamily>::estimateBufferSizeMultiTileB
return 0; return 0;
} }
// Launches one kernel of a split operation.
// This variant does not forward hEvent: a null event handle is passed to
// appendLaunchKernelWithParams, whose result is returned unchanged.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(ze_kernel_handle_t hKernel,
                                                                          const ze_group_count_t *pThreadGroupDimensions,
                                                                          ze_event_handle_t hEvent) {
    const ze_result_t launchResult = appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr,
                                                                  false, false, false);
    return launchResult;
}
// Brackets a group of walkers with event commands.
// beforeWalker == true  -> appendEventForProfiling(hEvent, true, false)
// beforeWalker == false -> appendSignalEventPostWalker(hEvent, false)
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) {
    if (!beforeWalker) {
        appendSignalEventPostWalker(hEvent, false);
        return;
    }
    appendEventForProfiling(hEvent, true, false);
}
} // namespace L0 } // namespace L0

View File

@@ -344,4 +344,30 @@ inline size_t CommandListCoreFamily<gfxCoreFamily>::estimateBufferSizeMultiTileB
false); false);
} }
// Launches one kernel of a split operation and attaches the caller's event.
// When an event is supplied its kernel count is incremented first; the event
// handle (possibly null) is then forwarded to appendLaunchKernelWithParams and
// that call's result is returned.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(ze_kernel_handle_t hKernel,
                                                                          const ze_group_count_t *pThreadGroupDimensions,
                                                                          ze_event_handle_t hEvent) {
    if (hEvent != nullptr) {
        auto signalEvent = Event::fromHandle(hEvent);
        signalEvent->increaseKernelCount();
    }

    return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, hEvent, false, false, false);
}
// Brackets a group of walkers with event handling.
// Copy-only lists: appendEventForProfiling(hEvent, true, false) before the
// walkers, appendSignalEventPostWalker(hEvent, false) after them.
// Other lists: no commands are appended here; before the walkers the kernel
// count of a supplied event is reset to zero, after the walkers nothing is done.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) {
    if (!isCopyOnly()) {
        if (beforeWalker && hEvent) {
            Event::fromHandle(hEvent)->zeroKernelCount();
        }
        return;
    }

    if (beforeWalker) {
        appendEventForProfiling(hEvent, true, false);
        return;
    }
    appendSignalEventPostWalker(hEvent, false);
}
} // namespace L0 } // namespace L0

View File

@@ -96,6 +96,17 @@ struct Event : _ze_event_handle_t {
return isTimestampEvent || usingContextEndOffset; return isTimestampEvent || usingContextEndOffset;
} }
// Adds one to kernelCount; aborts through UNRECOVERABLE_IF when the new value
// is greater than EventPacketsCount::maxKernelSplit.
void increaseKernelCount() {
    ++kernelCount;
    UNRECOVERABLE_IF(EventPacketsCount::maxKernelSplit < kernelCount);
}
uint32_t getKernelCount() const {
return kernelCount;
}
void zeroKernelCount() {
kernelCount = 0;
}
uint64_t globalStartTS; uint64_t globalStartTS;
uint64_t globalEndTS; uint64_t globalEndTS;
uint64_t contextStartTS; uint64_t contextStartTS;
@@ -110,8 +121,6 @@ struct Event : _ze_event_handle_t {
ze_event_scope_flags_t signalScope = 0u; ze_event_scope_flags_t signalScope = 0u;
ze_event_scope_flags_t waitScope = 0u; ze_event_scope_flags_t waitScope = 0u;
uint32_t kernelCount = 1u;
bool l3FlushWaApplied = false; bool l3FlushWaApplied = false;
protected: protected:
@@ -122,6 +131,9 @@ struct Event : _ze_event_handle_t {
size_t timestampSizeInDw = 0u; size_t timestampSizeInDw = 0u;
size_t singlePacketSize = 0u; size_t singlePacketSize = 0u;
size_t eventPoolOffset = 0u; size_t eventPoolOffset = 0u;
uint32_t kernelCount = 1u;
bool isTimestampEvent = false; bool isTimestampEvent = false;
bool usingContextEndOffset = false; bool usingContextEndOffset = false;
}; };
@@ -180,8 +192,7 @@ struct EventImp : public Event {
protected: protected:
ze_result_t calculateProfilingData(); ze_result_t calculateProfilingData();
ze_result_t queryStatusKernelTimestamp(); ze_result_t queryStatusEventPackets();
ze_result_t queryStatusNonTimestamp();
ze_result_t hostEventSetValue(TagSizeT eventValue); ze_result_t hostEventSetValue(TagSizeT eventValue);
ze_result_t hostEventSetValueTimestamps(TagSizeT eventVal); ze_result_t hostEventSetValueTimestamps(TagSizeT eventVal);
void assignKernelEventCompletionData(void *address); void assignKernelEventCompletionData(void *address);

View File

@@ -104,33 +104,13 @@ void EventImp<TagSizeT>::assignKernelEventCompletionData(void *address) {
} }
template <typename TagSizeT> template <typename TagSizeT>
ze_result_t EventImp<TagSizeT>::queryStatusKernelTimestamp() { ze_result_t EventImp<TagSizeT>::queryStatusEventPackets() {
assignKernelEventCompletionData(hostAddress); assignKernelEventCompletionData(hostAddress);
uint32_t queryVal = Event::STATE_CLEARED; uint32_t queryVal = Event::STATE_CLEARED;
for (uint32_t i = 0; i < kernelCount; i++) { for (uint32_t i = 0; i < kernelCount; i++) {
uint32_t packetsToCheck = kernelEventCompletionData[i].getPacketsUsed(); uint32_t packetsToCheck = kernelEventCompletionData[i].getPacketsUsed();
for (uint32_t packetId = 0; packetId < packetsToCheck; packetId++) { for (uint32_t packetId = 0; packetId < packetsToCheck; packetId++) {
bool ready = NEO::WaitUtils::waitFunctionWithPredicate<const TagSizeT>( void const *queryAddress = isUsingContextEndOffset()
static_cast<TagSizeT const *>(kernelEventCompletionData[i].getContextEndAddress(packetId)),
queryVal,
std::not_equal_to<TagSizeT>());
if (!ready) {
return ZE_RESULT_NOT_READY;
}
}
}
this->csr->getInternalAllocationStorage()->cleanAllocationList(this->csr->peekTaskCount(), NEO::AllocationUsage::TEMPORARY_ALLOCATION);
return ZE_RESULT_SUCCESS;
}
template <typename TagSizeT>
ze_result_t EventImp<TagSizeT>::queryStatusNonTimestamp() {
assignKernelEventCompletionData(hostAddress);
uint32_t queryVal = Event::STATE_CLEARED;
for (uint32_t i = 0; i < kernelCount; i++) {
uint32_t packetsToCheck = kernelEventCompletionData[i].getPacketsUsed();
for (uint32_t packetId = 0; packetId < packetsToCheck; packetId++) {
void const *queryAddress = usingContextEndOffset
? kernelEventCompletionData[i].getContextEndAddress(packetId) ? kernelEventCompletionData[i].getContextEndAddress(packetId)
: kernelEventCompletionData[i].getContextStartAddress(packetId); : kernelEventCompletionData[i].getContextStartAddress(packetId);
bool ready = NEO::WaitUtils::waitFunctionWithPredicate<const TagSizeT>( bool ready = NEO::WaitUtils::waitFunctionWithPredicate<const TagSizeT>(
@@ -156,11 +136,7 @@ ze_result_t EventImp<TagSizeT>::queryStatus() {
*hostAddr = metricStreamer->getNotificationState(); *hostAddr = metricStreamer->getNotificationState();
} }
this->csr->downloadAllocations(); this->csr->downloadAllocations();
if (isEventTimestampFlagSet()) { return queryStatusEventPackets();
return queryStatusKernelTimestamp();
} else {
return queryStatusNonTimestamp();
}
} }
template <typename TagSizeT> template <typename TagSizeT>
@@ -274,11 +250,9 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
template <typename TagSizeT> template <typename TagSizeT>
ze_result_t EventImp<TagSizeT>::reset() { ze_result_t EventImp<TagSizeT>::reset() {
if (isEventTimestampFlagSet()) { kernelCount = EventPacketsCount::maxKernelSplit;
kernelCount = EventPacketsCount::maxKernelSplit; for (uint32_t i = 0; i < kernelCount; i++) {
for (uint32_t i = 0; i < kernelCount; i++) { kernelEventCompletionData[i].setPacketsUsed(NEO::TimestampPacketSizeControl::preferredPacketCount);
kernelEventCompletionData[i].setPacketsUsed(NEO::TimestampPacketSizeControl::preferredPacketCount);
}
} }
hostEventSetValue(Event::STATE_INITIAL); hostEventSetValue(Event::STATE_INITIAL);
resetPackets(); resetPackets();

View File

@@ -8,6 +8,8 @@
#pragma once #pragma once
#include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_container/implicit_scaling.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/helpers/variable_backup.h" #include "shared/test/common/helpers/variable_backup.h"
#include "shared/test/common/test_macros/test.h" #include "shared/test/common/test_macros/test.h"
@@ -90,5 +92,96 @@ struct MultiTileCommandListFixture : public SingleRootMultiSubDeviceFixture {
std::unique_ptr<VariableBackup<bool>> osLocalMemoryBackup; std::unique_ptr<VariableBackup<bool>> osLocalMemoryBackup;
}; };
// Validates two back-to-back register-capture sequences in a parsed command list.
//
// Starting at the first MI_LOAD_REGISTER_REG found at or after startIt, the
// function expects, twice in a row:
//   MI_LOAD_REGISTER_REG  (given source register -> CS_GPR_R0)
//   MI_LOAD_REGISTER_IMM  (mask 0xfffffffe       -> CS_GPR_R1)
//   MI_MATH               (DwordLength == 3)
//   MI_STORE_REGISTER_MEM (CS_GPR_R2 -> given memory address, workload
//                          partition flag equal to workloadPartition)
//
// cmdList                          parsed command list being inspected
// startIt                          in: where the search begins;
//                                  out: one past the second MI_STORE_REGISTER_MEM
// firstLoadRegisterRegSrcAddress   expected source register of the first sequence
// firstStoreRegMemAddress          expected store address of the first sequence
// secondLoadRegisterRegSrcAddress  expected source register of the second sequence
// secondStoreRegMemAddress         expected store address of the second sequence
// workloadPartition                expected workload partition flag on both stores
//
// Every genCmdCast result is checked against nullptr before it is dereferenced,
// so an unexpected command type is reported as a test failure instead of
// crashing the test binary. On any fatal assertion the function returns early
// and startIt is left unchanged.
template <typename FamilyType>
void validateTimestampRegisters(GenCmdList &cmdList,
                                GenCmdList::iterator &startIt,
                                uint32_t firstLoadRegisterRegSrcAddress,
                                uint64_t firstStoreRegMemAddress,
                                uint32_t secondLoadRegisterRegSrcAddress,
                                uint64_t secondStoreRegMemAddress,
                                bool workloadPartition) {
    using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
    using MI_MATH = typename FamilyType::MI_MATH;
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;

    constexpr uint32_t mask = 0xfffffffe;

    // ---- first sequence ----
    auto itor = find<MI_LOAD_REGISTER_REG *>(startIt, cmdList.end());
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdLoadReg = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
        ASSERT_NE(nullptr, cmdLoadReg);
        EXPECT_EQ(firstLoadRegisterRegSrcAddress, cmdLoadReg->getSourceRegisterAddress());
        EXPECT_EQ(CS_GPR_R0, cmdLoadReg->getDestinationRegisterAddress());
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdLoadImm = genCmdCast<MI_LOAD_REGISTER_IMM *>(*itor);
        ASSERT_NE(nullptr, cmdLoadImm);
        EXPECT_EQ(CS_GPR_R1, cmdLoadImm->getRegisterOffset());
        EXPECT_EQ(mask, cmdLoadImm->getDataDword());
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdMath = genCmdCast<MI_MATH *>(*itor);
        ASSERT_NE(nullptr, cmdMath);
        EXPECT_EQ(3u, cmdMath->DW0.BitField.DwordLength);
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
        ASSERT_NE(nullptr, cmdMem);
        EXPECT_EQ(CS_GPR_R2, cmdMem->getRegisterAddress());
        EXPECT_EQ(firstStoreRegMemAddress, cmdMem->getMemoryAddress());
        if (workloadPartition) {
            EXPECT_TRUE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
        } else {
            EXPECT_FALSE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
        }
    }

    // ---- second sequence, expected immediately after the first ----
    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdLoadReg = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
        ASSERT_NE(nullptr, cmdLoadReg);
        EXPECT_EQ(secondLoadRegisterRegSrcAddress, cmdLoadReg->getSourceRegisterAddress());
        EXPECT_EQ(CS_GPR_R0, cmdLoadReg->getDestinationRegisterAddress());
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdLoadImm = genCmdCast<MI_LOAD_REGISTER_IMM *>(*itor);
        ASSERT_NE(nullptr, cmdLoadImm);
        EXPECT_EQ(CS_GPR_R1, cmdLoadImm->getRegisterOffset());
        EXPECT_EQ(mask, cmdLoadImm->getDataDword());
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdMath = genCmdCast<MI_MATH *>(*itor);
        ASSERT_NE(nullptr, cmdMath);
        EXPECT_EQ(3u, cmdMath->DW0.BitField.DwordLength);
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
        ASSERT_NE(nullptr, cmdMem);
        EXPECT_EQ(CS_GPR_R2, cmdMem->getRegisterAddress());
        EXPECT_EQ(secondStoreRegMemAddress, cmdMem->getMemoryAddress());
        if (workloadPartition) {
            EXPECT_TRUE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
        } else {
            EXPECT_FALSE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
        }
    }

    // Hand the position just past the validated commands back to the caller.
    ++itor;
    startIt = itor;
}
} // namespace ult } // namespace ult
} // namespace L0 } // namespace L0

View File

@@ -347,5 +347,76 @@ struct MockCommandList : public CommandList {
uint8_t *batchBuffer = nullptr; uint8_t *batchBuffer = nullptr;
NEO::GraphicsAllocation *mockAllocation = nullptr; NEO::GraphicsAllocation *mockAllocation = nullptr;
}; };
// Test double for CommandListCoreFamily used by memory-copy tests.
//
// - appendMemoryCopyKernelWithGA and appendMemoryCopyBlit are generated by the
//   ADDMETHOD / ADDMETHOD_NOBASE mock macros with ZE_RESULT_SUCCESS as the
//   configured result value.
// - appendMemoryCopyKernel2d / appendMemoryCopyKernel3d record the aligned
//   source and destination pointers they receive, then call the base class.
// - appendMemoryCopyBlitRegion records the source and destination offsets it
//   receives, then calls the base class.
// - getAlignedAllocation forwards directly to the base class.
//
// All recorded values start at zero so a test that reads them before the
// corresponding path ran observes a defined value.
template <GFXCORE_FAMILY gfxCoreFamily>
class MockAppendMemoryCopy : public CommandListCoreFamily<gfxCoreFamily> {
  public:
    using BaseClass = CommandListCoreFamily<gfxCoreFamily>;

    ADDMETHOD(appendMemoryCopyKernelWithGA, ze_result_t, false, ZE_RESULT_SUCCESS,
              (void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,
               uint64_t dstOffset, void *srcPtr,
               NEO::GraphicsAllocation *srcPtrAlloc,
               uint64_t srcOffset, uint64_t size,
               uint64_t elementSize, Builtin builtin,
               ze_event_handle_t hSignalEvent,
               bool isStateless),
              (dstPtr, dstPtrAlloc, dstOffset, srcPtr, srcPtrAlloc, srcOffset, size, elementSize, builtin, hSignalEvent, isStateless));

    ADDMETHOD_NOBASE(appendMemoryCopyBlit, ze_result_t, ZE_RESULT_SUCCESS,
                     (uintptr_t dstPtr,
                      NEO::GraphicsAllocation *dstPtrAlloc,
                      uint64_t dstOffset, uintptr_t srcPtr,
                      NEO::GraphicsAllocation *srcPtrAlloc,
                      uint64_t srcOffset,
                      uint64_t size));

    // Straight pass-through to the base class implementation.
    AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize, bool allowHostCopy) override {
        return L0::CommandListCoreFamily<gfxCoreFamily>::getAlignedAllocation(device, buffer, bufferSize, allowHostCopy);
    }

    // Captures the aligned pointers of both allocations, then runs the base 2d copy.
    ze_result_t appendMemoryCopyKernel2d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation,
                                         Builtin builtin, const ze_copy_region_t *dstRegion,
                                         uint32_t dstPitch, size_t dstOffset,
                                         const ze_copy_region_t *srcRegion, uint32_t srcPitch,
                                         size_t srcOffset, ze_event_handle_t hSignalEvent,
                                         uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override {
        srcAlignedPtr = srcAlignedAllocation->alignedAllocationPtr;
        dstAlignedPtr = dstAlignedAllocation->alignedAllocationPtr;
        return L0::CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernel2d(dstAlignedAllocation, srcAlignedAllocation, builtin, dstRegion, dstPitch, dstOffset, srcRegion, srcPitch, srcOffset, hSignalEvent, numWaitEvents, phWaitEvents);
    }

    // Captures the aligned pointers of both allocations, then runs the base 3d copy.
    ze_result_t appendMemoryCopyKernel3d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation,
                                         Builtin builtin, const ze_copy_region_t *dstRegion,
                                         uint32_t dstPitch, uint32_t dstSlicePitch, size_t dstOffset,
                                         const ze_copy_region_t *srcRegion, uint32_t srcPitch,
                                         uint32_t srcSlicePitch, size_t srcOffset,
                                         ze_event_handle_t hSignalEvent, uint32_t numWaitEvents,
                                         ze_event_handle_t *phWaitEvents) override {
        srcAlignedPtr = srcAlignedAllocation->alignedAllocationPtr;
        dstAlignedPtr = dstAlignedAllocation->alignedAllocationPtr;
        return L0::CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernel3d(dstAlignedAllocation, srcAlignedAllocation, builtin, dstRegion, dstPitch, dstSlicePitch, dstOffset, srcRegion, srcPitch, srcSlicePitch, srcOffset, hSignalEvent, numWaitEvents, phWaitEvents);
    }

    // Captures the source/destination offsets, then runs the base blit region copy.
    ze_result_t appendMemoryCopyBlitRegion(NEO::GraphicsAllocation *srcAllocation,
                                           NEO::GraphicsAllocation *dstAllocation,
                                           size_t srcOffset,
                                           size_t dstOffset,
                                           ze_copy_region_t srcRegion,
                                           ze_copy_region_t dstRegion, const Vec3<size_t> &copySize,
                                           size_t srcRowPitch, size_t srcSlicePitch,
                                           size_t dstRowPitch, size_t dstSlicePitch,
                                           const Vec3<size_t> &srcSize, const Vec3<size_t> &dstSize, ze_event_handle_t hSignalEvent,
                                           uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override {
        srcBlitCopyRegionOffset = srcOffset;
        dstBlitCopyRegionOffset = dstOffset;
        return L0::CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(srcAllocation, dstAllocation, srcOffset, dstOffset, srcRegion, dstRegion, copySize, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, srcSize, dstSize, hSignalEvent, numWaitEvents, phWaitEvents);
    }

    // Values recorded by the overrides above; zero until the matching call happens.
    uintptr_t srcAlignedPtr = 0;
    uintptr_t dstAlignedPtr = 0;
    size_t srcBlitCopyRegionOffset = 0;
    size_t dstBlitCopyRegionOffset = 0;
};
} // namespace ult } // namespace ult
} // namespace L0 } // namespace L0

View File

@@ -501,13 +501,6 @@ HWTEST2_F(CommandListCreate, givenCommandListWhenMemoryCopyWithSignalEventsThenS
itor++; itor++;
itor = find<SEMAPHORE_WAIT *>(itor, cmdList.end()); itor = find<SEMAPHORE_WAIT *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor); EXPECT_NE(cmdList.end(), itor);
itor++;
itor = find<PIPE_CONTROL *>(itor, cmdList.end());
if (MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, *defaultHwInfo)) {
EXPECT_NE(cmdList.end(), itor);
} else {
EXPECT_EQ(cmdList.end(), itor);
}
} }
using platformSupport = IsWithinProducts<IGFX_SKYLAKE, IGFX_TIGERLAKE_LP>; using platformSupport = IsWithinProducts<IGFX_SKYLAKE, IGFX_TIGERLAKE_LP>;
@@ -540,22 +533,18 @@ HWTEST2_F(CommandListCreate, givenCommandListWhenMemoryCopyWithSignalEventScopeS
cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed())); cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed()));
auto iterator = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end()); auto iterator = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
bool postSyncFound = false; uint32_t postSyncFound = 0;
ASSERT_NE(0u, iterator.size()); ASSERT_NE(0u, iterator.size());
uint32_t numPCs = 0;
for (auto it : iterator) { for (auto it : iterator) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it); auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
numPCs++;
if ((cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) && if ((cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) &&
(cmd->getImmediateData() == Event::STATE_SIGNALED) && (cmd->getImmediateData() == Event::STATE_SIGNALED) &&
(cmd->getDcFlushEnable())) { (cmd->getDcFlushEnable())) {
postSyncFound = true; postSyncFound++;
break;
} }
} }
ASSERT_TRUE(postSyncFound); EXPECT_EQ(1u, postSyncFound);
EXPECT_EQ(numPCs, iterator.size());
} }
HWTEST2_F(CommandListCreate, givenCommandListWhenMemoryCopyWithSignalEventScopeSetToSubDeviceThenB2BPipeControlIsAddedWithDcFlushForLastPC, platformSupport) { HWTEST2_F(CommandListCreate, givenCommandListWhenMemoryCopyWithSignalEventScopeSetToSubDeviceThenB2BPipeControlIsAddedWithDcFlushForLastPC, platformSupport) {
@@ -585,22 +574,18 @@ HWTEST2_F(CommandListCreate, givenCommandListWhenMemoryCopyWithSignalEventScopeS
cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed())); cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed()));
auto iterator = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end()); auto iterator = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
bool postSyncFound = false; uint32_t postSyncFound = 0;
ASSERT_NE(0u, iterator.size()); ASSERT_NE(0u, iterator.size());
uint32_t numPCs = 0;
for (auto it : iterator) { for (auto it : iterator) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it); auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
numPCs++;
if ((cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) && if ((cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) &&
(cmd->getImmediateData() == Event::STATE_SIGNALED) && (cmd->getImmediateData() == Event::STATE_SIGNALED) &&
(!cmd->getDcFlushEnable())) { (!cmd->getDcFlushEnable())) {
postSyncFound = true; postSyncFound++;
break;
} }
} }
ASSERT_TRUE(postSyncFound); EXPECT_EQ(1u, postSyncFound);
EXPECT_EQ(numPCs, iterator.size() - 1);
auto it = *(iterator.end() - 1); auto it = *(iterator.end() - 1);
auto cmd1 = genCmdCast<PIPE_CONTROL *>(*it); auto cmd1 = genCmdCast<PIPE_CONTROL *>(*it);

View File

@@ -77,12 +77,19 @@ HWTEST2_F(CommandListCreate, givenCommandListWhenAppendWriteGlobalTimestampCalle
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), commandStreamOffset), ptrOffset(commandContainer.getCommandStream()->getCpuBase(), commandStreamOffset),
commandContainer.getCommandStream()->getUsed() - commandStreamOffset)); commandContainer.getCommandStream()->getUsed() - commandStreamOffset));
auto iterator = find<PIPE_CONTROL *>(cmdList.begin(), cmdList.end()); auto pcList = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
auto cmd = genCmdCast<PIPE_CONTROL *>(*iterator); ASSERT_NE(0u, pcList.size());
EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); bool foundTimestampPipeControl = false;
EXPECT_FALSE(cmd->getDcFlushEnable()); for (auto it : pcList) {
EXPECT_EQ(timestampAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd)); auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
EXPECT_EQ(POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_TIMESTAMP, cmd->getPostSyncOperation()); if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_TIMESTAMP) {
EXPECT_TRUE(cmd->getCommandStreamerStallEnable());
EXPECT_FALSE(cmd->getDcFlushEnable());
EXPECT_EQ(timestampAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
foundTimestampPipeControl = true;
}
}
EXPECT_TRUE(foundTimestampPipeControl);
} }
HWTEST2_F(CommandListCreate, givenCommandListWhenAppendWriteGlobalTimestampCalledThenTimestampAllocationIsInsideResidencyContainer, IsAtLeastSkl) { HWTEST2_F(CommandListCreate, givenCommandListWhenAppendWriteGlobalTimestampCalledThenTimestampAllocationIsInsideResidencyContainer, IsAtLeastSkl) {

View File

@@ -7,7 +7,6 @@
#include "shared/source/command_container/command_encoder.h" #include "shared/source/command_container/command_encoder.h"
#include "shared/source/helpers/hw_helper.h" #include "shared/source/helpers/hw_helper.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/test_macros/test.h" #include "shared/test/common/test_macros/test.h"
@@ -382,82 +381,6 @@ HWTEST2_F(MultiTileCommandListAppendBarrier,
EXPECT_EQ(1u, postSyncFound); EXPECT_EQ(1u, postSyncFound);
} }
// Validates the register-based timestamp capture sequence found in a parsed command list.
//
// Starting at the first MI_LOAD_REGISTER_REG in cmdList, eight consecutive commands are expected:
//   1. MI_LOAD_REGISTER_REG   REG_GLOBAL_TIMESTAMP_LDW -> CS_GPR_R0
//   2. MI_LOAD_REGISTER_IMM   0xfffffffe -> CS_GPR_R1
//   3. MI_MATH                with DwordLength == 3
//   4. MI_STORE_REGISTER_MEM  CS_GPR_R2 -> firstRegisterAddress, partition-id offset enabled
//   5-8. the same four commands, sourced from GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW and
//        stored to secondRegisterAddress.
//
// cmdList               - parsed command list to inspect
// firstRegisterAddress  - expected store address of the first MI_STORE_REGISTER_MEM
// secondRegisterAddress - expected store address of the second MI_STORE_REGISTER_MEM
//
// Every genCmdCast result obtained after a plain iterator increment is asserted to be
// non-null before it is dereferenced, so an unexpected command type is reported as a test
// failure instead of crashing the test binary.
template <typename FamilyType>
void validateTimestampRegisters(GenCmdList &cmdList,
                                uint64_t firstRegisterAddress, uint64_t secondRegisterAddress) {
    using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
    using MI_MATH = typename FamilyType::MI_MATH;
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;

    constexpr uint32_t mask = 0xfffffffe;

    // First sequence: source register is REG_GLOBAL_TIMESTAMP_LDW.
    auto itor = find<MI_LOAD_REGISTER_REG *>(cmdList.begin(), cmdList.end());
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdLoadReg = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
        ASSERT_NE(nullptr, cmdLoadReg);
        EXPECT_EQ(REG_GLOBAL_TIMESTAMP_LDW, cmdLoadReg->getSourceRegisterAddress());
        EXPECT_EQ(CS_GPR_R0, cmdLoadReg->getDestinationRegisterAddress());
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdLoadImm = genCmdCast<MI_LOAD_REGISTER_IMM *>(*itor);
        ASSERT_NE(nullptr, cmdLoadImm);
        EXPECT_EQ(CS_GPR_R1, cmdLoadImm->getRegisterOffset());
        EXPECT_EQ(mask, cmdLoadImm->getDataDword());
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdMath = genCmdCast<MI_MATH *>(*itor);
        ASSERT_NE(nullptr, cmdMath);
        EXPECT_EQ(3u, cmdMath->DW0.BitField.DwordLength);
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
        ASSERT_NE(nullptr, cmdMem);
        EXPECT_EQ(CS_GPR_R2, cmdMem->getRegisterAddress());
        EXPECT_EQ(firstRegisterAddress, cmdMem->getMemoryAddress());
        EXPECT_TRUE(cmdMem->getWorkloadPartitionIdOffsetEnable());
    }

    // Second sequence: source register is GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW.
    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdLoadReg = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
        ASSERT_NE(nullptr, cmdLoadReg);
        EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, cmdLoadReg->getSourceRegisterAddress());
        EXPECT_EQ(CS_GPR_R0, cmdLoadReg->getDestinationRegisterAddress());
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdLoadImm = genCmdCast<MI_LOAD_REGISTER_IMM *>(*itor);
        ASSERT_NE(nullptr, cmdLoadImm);
        EXPECT_EQ(CS_GPR_R1, cmdLoadImm->getRegisterOffset());
        EXPECT_EQ(mask, cmdLoadImm->getDataDword());
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdMath = genCmdCast<MI_MATH *>(*itor);
        ASSERT_NE(nullptr, cmdMath);
        EXPECT_EQ(3u, cmdMath->DW0.BitField.DwordLength);
    }

    ++itor;
    {
        ASSERT_NE(cmdList.end(), itor);
        auto cmdMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
        ASSERT_NE(nullptr, cmdMem);
        EXPECT_EQ(CS_GPR_R2, cmdMem->getRegisterAddress());
        EXPECT_EQ(secondRegisterAddress, cmdMem->getMemoryAddress());
        EXPECT_TRUE(cmdMem->getWorkloadPartitionIdOffsetEnable());
    }
}
HWTEST2_F(MultiTileCommandListAppendBarrier, HWTEST2_F(MultiTileCommandListAppendBarrier,
GivenTimestampEventSignalWhenAppendingMultTileBarrierThenExpectMultiTileBarrierAndTimestampOperations, IsWithinXeGfxFamily) { GivenTimestampEventSignalWhenAppendingMultTileBarrierThenExpectMultiTileBarrierAndTimestampOperations, IsWithinXeGfxFamily) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
@@ -533,7 +456,12 @@ HWTEST2_F(MultiTileCommandListAppendBarrier,
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
cmdBuffer, cmdBuffer,
timestampRegisters)); timestampRegisters));
validateTimestampRegisters<FamilyType>(cmdList, globalStartAddress, contextStartAddress); auto begin = cmdList.begin();
validateTimestampRegisters<FamilyType>(cmdList,
begin,
REG_GLOBAL_TIMESTAMP_LDW, globalStartAddress,
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddress,
true);
auto gpuBaseAddress = cmdListStream->getGraphicsAllocation()->getGpuAddress() + useSizeBefore + timestampRegisters; auto gpuBaseAddress = cmdListStream->getGraphicsAllocation()->getGpuAddress() + useSizeBefore + timestampRegisters;
@@ -557,7 +485,12 @@ HWTEST2_F(MultiTileCommandListAppendBarrier,
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
cmdBuffer, cmdBuffer,
timestampRegisters)); timestampRegisters));
validateTimestampRegisters<FamilyType>(cmdList, globalEndAddress, contextEndAddress); begin = cmdList.begin();
validateTimestampRegisters<FamilyType>(cmdList,
begin,
REG_GLOBAL_TIMESTAMP_LDW, globalEndAddress,
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddress,
true);
} }
} // namespace ult } // namespace ult

View File

@@ -8,6 +8,7 @@
#include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/test_macros/test.h" #include "shared/test/common/test_macros/test.h"
#include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
@@ -17,70 +18,6 @@ namespace ult {
using AppendMemoryCopy = Test<DeviceFixture>; using AppendMemoryCopy = Test<DeviceFixture>;
// Test double for CommandListCoreFamily used by the memory-copy tests.
//
// - appendMemoryCopyKernelWithGA and appendMemoryCopyBlit are declared through
//   ADDMETHOD_NOBASE with ZE_RESULT_SUCCESS as the configured result.
// - The 2D/3D kernel copy and blit-region overrides record selected arguments in public
//   members and then forward to the real CommandListCoreFamily implementation.
//
// All recording members are zero-initialized, so a test may read them even when the
// corresponding override was never invoked.
template <GFXCORE_FAMILY gfxCoreFamily>
class MockAppendMemoryCopy : public WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>> {
  public:
    ADDMETHOD_NOBASE(appendMemoryCopyKernelWithGA, ze_result_t, ZE_RESULT_SUCCESS,
                     (void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,
                      uint64_t dstOffset, void *srcPtr,
                      NEO::GraphicsAllocation *srcPtrAlloc,
                      uint64_t srcOffset, uint64_t size,
                      uint64_t elementSize, Builtin builtin,
                      ze_event_handle_t hSignalEvent,
                      bool isStateless));

    ADDMETHOD_NOBASE(appendMemoryCopyBlit, ze_result_t, ZE_RESULT_SUCCESS,
                     (uintptr_t dstPtr,
                      NEO::GraphicsAllocation *dstPtrAlloc,
                      uint64_t dstOffset, uintptr_t srcPtr,
                      NEO::GraphicsAllocation *srcPtrAlloc,
                      uint64_t srcOffset,
                      uint64_t size));

    // Pure pass-through to the base implementation.
    AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize, bool allowHostCopy) override {
        return L0::CommandListCoreFamily<gfxCoreFamily>::getAlignedAllocation(device, buffer, bufferSize, allowHostCopy);
    }

    // Records the aligned source/destination pointers, then forwards to the base implementation.
    ze_result_t appendMemoryCopyKernel2d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation,
                                         Builtin builtin, const ze_copy_region_t *dstRegion,
                                         uint32_t dstPitch, size_t dstOffset,
                                         const ze_copy_region_t *srcRegion, uint32_t srcPitch,
                                         size_t srcOffset, ze_event_handle_t hSignalEvent,
                                         uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override {
        srcAlignedPtr = srcAlignedAllocation->alignedAllocationPtr;
        dstAlignedPtr = dstAlignedAllocation->alignedAllocationPtr;
        return L0::CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernel2d(dstAlignedAllocation, srcAlignedAllocation, builtin, dstRegion, dstPitch, dstOffset, srcRegion, srcPitch, srcOffset, hSignalEvent, numWaitEvents, phWaitEvents);
    }

    // Records the aligned source/destination pointers, then forwards to the base implementation.
    ze_result_t appendMemoryCopyKernel3d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation,
                                         Builtin builtin, const ze_copy_region_t *dstRegion,
                                         uint32_t dstPitch, uint32_t dstSlicePitch, size_t dstOffset,
                                         const ze_copy_region_t *srcRegion, uint32_t srcPitch,
                                         uint32_t srcSlicePitch, size_t srcOffset,
                                         ze_event_handle_t hSignalEvent, uint32_t numWaitEvents,
                                         ze_event_handle_t *phWaitEvents) override {
        srcAlignedPtr = srcAlignedAllocation->alignedAllocationPtr;
        dstAlignedPtr = dstAlignedAllocation->alignedAllocationPtr;
        return L0::CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernel3d(dstAlignedAllocation, srcAlignedAllocation, builtin, dstRegion, dstPitch, dstSlicePitch, dstOffset, srcRegion, srcPitch, srcSlicePitch, srcOffset, hSignalEvent, numWaitEvents, phWaitEvents);
    }

    // Records the source/destination offsets, then forwards to the base implementation.
    ze_result_t appendMemoryCopyBlitRegion(NEO::GraphicsAllocation *srcAllocation,
                                           NEO::GraphicsAllocation *dstAllocation,
                                           size_t srcOffset,
                                           size_t dstOffset,
                                           ze_copy_region_t srcRegion,
                                           ze_copy_region_t dstRegion, const Vec3<size_t> &copySize,
                                           size_t srcRowPitch, size_t srcSlicePitch,
                                           size_t dstRowPitch, size_t dstSlicePitch,
                                           const Vec3<size_t> &srcSize, const Vec3<size_t> &dstSize, ze_event_handle_t hSignalEvent,
                                           uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override {
        srcBlitCopyRegionOffset = srcOffset;
        dstBlitCopyRegionOffset = dstOffset;
        return L0::CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(srcAllocation, dstAllocation, srcOffset, dstOffset, srcRegion, dstRegion, copySize, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, srcSize, dstSize, hSignalEvent, numWaitEvents, phWaitEvents);
    }

    // Last aligned pointers seen by appendMemoryCopyKernel2d/3d (0 until one of them runs).
    uintptr_t srcAlignedPtr = 0;
    uintptr_t dstAlignedPtr = 0;
    // Last offsets seen by appendMemoryCopyBlitRegion (0 until it runs).
    size_t srcBlitCopyRegionOffset = 0;
    size_t dstBlitCopyRegionOffset = 0;
};
HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionCalledThenTwoNewAllocationAreAddedToHostMapPtr, IsAtLeastSkl) { HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionCalledThenTwoNewAllocationAreAddedToHostMapPtr, IsAtLeastSkl) {
MockAppendMemoryCopy<gfxCoreFamily> cmdList; MockAppendMemoryCopy<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -343,12 +280,14 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListWhenTimestampPassedToMemoryCopyT
} }
using SupportedPlatforms = IsWithinProducts<IGFX_SKYLAKE, IGFX_DG1>; using SupportedPlatforms = IsWithinProducts<IGFX_SKYLAKE, IGFX_DG1>;
HWTEST2_F(AppendMemoryCopy, givenCommandListWhenTimestampPassedToMemoryCopyThenAppendProfilingCalledOnceBeforeAndAfterCommand, SupportedPlatforms) { HWTEST2_F(AppendMemoryCopy,
givenCommandListUsesTimestampPassedToMemoryCopyWhenTwoKernelsAreUsedThenAppendProfilingCalledForSinglePacket, SupportedPlatforms) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily; using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using MI_LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG; using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
MockAppendMemoryCopy<gfxCoreFamily> commandList; MockAppendMemoryCopy<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1234); void *srcPtr = reinterpret_cast<void *>(0x1234);
void *dstPtr = reinterpret_cast<void *>(0x2345); void *dstPtr = reinterpret_cast<void *>(0x2345);
@@ -365,65 +304,97 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListWhenTimestampPassedToMemoryCopyThenA
EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device)); auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
uint64_t globalStartAddress = event->getGpuAddress(device) + event->getGlobalStartOffset();
uint64_t contextStartAddress = event->getGpuAddress(device) + event->getContextStartOffset();
uint64_t globalEndAddress = event->getGpuAddress(device) + event->getGlobalEndOffset();
uint64_t contextEndAddress = event->getGpuAddress(device) + event->getContextEndOffset();
commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100, event->toHandle(), 0, nullptr); commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100, event->toHandle(), 0, nullptr);
EXPECT_GT(commandList.appendMemoryCopyKernelWithGACalled, 0u); EXPECT_EQ(2u, commandList.appendMemoryCopyKernelWithGACalled);
EXPECT_EQ(commandList.appendMemoryCopyBlitCalled, 0u); EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled);
EXPECT_EQ(1u, event->getPacketsInUse()); EXPECT_EQ(1u, event->getPacketsInUse());
EXPECT_EQ(1u, event->getKernelCount());
GenCmdList cmdList; GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0),
commandList.commandContainer.getCommandStream()->getUsed())); commandList.commandContainer.getCommandStream()->getUsed()));
auto itor = find<MI_LOAD_REGISTER_REG *>(cmdList.begin(), cmdList.end());
EXPECT_NE(cmdList.end(), itor);
{
auto cmd = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
EXPECT_EQ(cmd->getSourceRegisterAddress(), REG_GLOBAL_TIMESTAMP_LDW);
}
itor++; auto itorWalkers = findAll<GPGPU_WALKER *>(cmdList.begin(), cmdList.end());
itor = find<MI_LOAD_REGISTER_REG *>(itor, cmdList.end()); auto begin = cmdList.begin();
EXPECT_NE(cmdList.end(), itor); ASSERT_EQ(2u, itorWalkers.size());
{ auto secondWalker = itorWalkers[1];
auto cmd = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
EXPECT_EQ(cmd->getSourceRegisterAddress(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
}
itor++; validateTimestampRegisters<FamilyType>(cmdList,
itor = find<PIPE_CONTROL *>(itor, cmdList.end()); begin,
EXPECT_NE(cmdList.end(), itor); REG_GLOBAL_TIMESTAMP_LDW, globalStartAddress,
{ GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddress,
auto cmd = genCmdCast<PIPE_CONTROL *>(*itor); false);
EXPECT_FALSE(cmd->getDcFlushEnable());
}
itor++; validateTimestampRegisters<FamilyType>(cmdList,
itor = find<MI_LOAD_REGISTER_REG *>(itor, cmdList.end()); secondWalker,
EXPECT_NE(cmdList.end(), itor); REG_GLOBAL_TIMESTAMP_LDW, globalEndAddress,
{ GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddress,
auto cmd = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor); false);
EXPECT_EQ(cmd->getSourceRegisterAddress(), REG_GLOBAL_TIMESTAMP_LDW);
}
itor++;
itor = find<MI_LOAD_REGISTER_REG *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
{
auto cmd = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
EXPECT_EQ(cmd->getSourceRegisterAddress(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
}
auto temp = itor;
auto numPCs = findAll<PIPE_CONTROL *>(temp, cmdList.end());
//we should have only one PC with dcFlush added
ASSERT_EQ(1u, numPCs.size());
itor = find<PIPE_CONTROL *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
{
auto cmd = genCmdCast<PIPE_CONTROL *>(*itor);
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, *defaultHwInfo), cmd->getDcFlushEnable());
}
} }
// A memory copy that is expected to dispatch three copy kernels must still profile into a
// single event packet: start timestamps are captured from the beginning of the stream and
// end timestamps from the last walker onward.
// NOTE(review): the three-kernel split presumably comes from the unaligned pointers/size
// chosen below - confirm against appendMemoryCopy.
HWTEST2_F(AppendMemoryCopy,
          givenCommandListUsesTimestampPassedToMemoryCopyWhenThreeKernelsAreUsedThenAppendProfilingCalledForSinglePacket, SupportedPlatforms) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;

    MockAppendMemoryCopy<gfxCoreFamily> commandList;
    commandList.appendMemoryCopyKernelWithGACallBase = true;
    commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    void *source = reinterpret_cast<void *>(0x1231);
    void *destination = reinterpret_cast<void *>(0x200002345);

    // Timestamp-capable pool holding exactly one event.
    ze_event_pool_desc_t poolDesc = {};
    poolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    poolDesc.count = 1;

    ze_event_desc_t signalDesc = {};
    signalDesc.index = 0;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto timestampPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &poolDesc, result));
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    auto timestampEvent = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(timestampPool.get(), &signalDesc, device));

    // Expected destinations of the four timestamp stores.
    const uint64_t eventBase = timestampEvent->getGpuAddress(device);
    const uint64_t globalStartAddress = eventBase + timestampEvent->getGlobalStartOffset();
    const uint64_t contextStartAddress = eventBase + timestampEvent->getContextStartOffset();
    const uint64_t globalEndAddress = eventBase + timestampEvent->getGlobalEndOffset();
    const uint64_t contextEndAddress = eventBase + timestampEvent->getContextEndOffset();

    commandList.appendMemoryCopy(destination, source, 0x100002345, timestampEvent->toHandle(), 0, nullptr);

    EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled);
    EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled);
    EXPECT_EQ(1u, timestampEvent->getPacketsInUse());
    EXPECT_EQ(1u, timestampEvent->getKernelCount());

    auto *stream = commandList.commandContainer.getCommandStream();
    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands, ptrOffset(stream->getCpuBase(), 0),
        stream->getUsed()));

    auto walkers = findAll<GPGPU_WALKER *>(parsedCommands.begin(), parsedCommands.end());
    auto streamStart = parsedCommands.begin();
    ASSERT_EQ(3u, walkers.size());
    auto lastWalker = walkers[2];

    // Start timestamps: searched from the beginning of the stream.
    validateTimestampRegisters<FamilyType>(parsedCommands,
                                           streamStart,
                                           REG_GLOBAL_TIMESTAMP_LDW, globalStartAddress,
                                           GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddress,
                                           false);

    // End timestamps: searched from the third walker onward.
    validateTimestampRegisters<FamilyType>(parsedCommands,
                                           lastWalker,
                                           REG_GLOBAL_TIMESTAMP_LDW, globalEndAddress,
                                           GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddress,
                                           false);
}
} // namespace ult } // namespace ult
} // namespace L0 } // namespace L0

View File

@@ -355,5 +355,79 @@ HWTEST2_F(CommandListAppendSignalEvent,
EXPECT_EQ(1u, postSyncFound); EXPECT_EQ(1u, postSyncFound);
} }
// With a timestamp event and partitionCount == 2, appendWriteGlobalTimestamp must
//  - mark the event as using one packet per partition,
//  - emit a PIPE_CONTROL that writes the timestamp to the user-provided address, and
//  - emit the register-based start/end timestamp sequences (checked by
//    validateTimestampRegisters with its last argument set to true).
//
// Iterator checks that guard a dereference use ASSERT_NE so that a missing PIPE_CONTROL
// ends the test instead of dereferencing cmdList.end().
HWTEST2_F(CommandListAppendSignalEvent,
          givenMultiTileCommandListWhenAppendWriteGlobalTimestampCalledWithSignalEventThenWorkPartitionedRegistersAreUsed, IsAtLeastXeHpCore) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;

    auto &commandContainer = commandList->commandContainer;

    uint64_t timestampAddress = 0x12345678555500;
    uint64_t *dstptr = reinterpret_cast<uint64_t *>(timestampAddress);
    constexpr uint32_t packets = 2u;

    event->setEventTimestampFlag(true);
    commandList->partitionCount = packets;

    commandList->appendWriteGlobalTimestamp(dstptr, event->toHandle(), 0, nullptr);
    EXPECT_EQ(packets, event->getPacketsInUse());

    auto eventGpuAddress = event->getGpuAddress(device);
    uint64_t contextStartAddress = eventGpuAddress + event->getContextStartOffset();
    uint64_t globalStartAddress = eventGpuAddress + event->getGlobalStartOffset();
    uint64_t contextEndAddress = eventGpuAddress + event->getContextEndOffset();
    uint64_t globalEndAddress = eventGpuAddress + event->getGlobalEndOffset();

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed()));

    // Walk the PIPE_CONTROLs until the one carrying the timestamp write is found.
    auto itorPC = find<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorPC);
    auto cmd = genCmdCast<PIPE_CONTROL *>(*itorPC);
    while (cmd->getPostSyncOperation() != POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_TIMESTAMP) {
        ++itorPC;
        itorPC = find<PIPE_CONTROL *>(itorPC, cmdList.end());
        ASSERT_NE(cmdList.end(), itorPC);
        cmd = genCmdCast<PIPE_CONTROL *>(*itorPC);
    }
    EXPECT_TRUE(cmd->getCommandStreamerStallEnable());
    EXPECT_FALSE(cmd->getDcFlushEnable());
    EXPECT_EQ(timestampAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));

    // The same iterator is handed to every validation call below.
    // NOTE(review): presumably validateTimestampRegisters advances it past the commands it
    // has checked - confirm against the helper's signature.
    auto startCmdList = cmdList.begin();
    validateTimestampRegisters<FamilyType>(cmdList,
                                           startCmdList,
                                           REG_GLOBAL_TIMESTAMP_LDW, globalStartAddress,
                                           GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddress,
                                           true);

    if (UnitTestHelper<FamilyType>::timestampRegisterHighAddress()) {
        uint64_t globalStartAddressHigh = globalStartAddress + sizeof(uint32_t);
        uint64_t contextStartAddressHigh = contextStartAddress + sizeof(uint32_t);
        // NOTE(review): 0x23AC looks like the upper dword of the context timestamp
        // register; no named constant is visible here - confirm.
        validateTimestampRegisters<FamilyType>(cmdList,
                                               startCmdList,
                                               REG_GLOBAL_TIMESTAMP_UN, globalStartAddressHigh,
                                               0x23AC, contextStartAddressHigh,
                                               true);
    }

    validateTimestampRegisters<FamilyType>(cmdList,
                                           startCmdList,
                                           REG_GLOBAL_TIMESTAMP_LDW, globalEndAddress,
                                           GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddress,
                                           true);

    if (UnitTestHelper<FamilyType>::timestampRegisterHighAddress()) {
        uint64_t globalEndAddressHigh = globalEndAddress + sizeof(uint32_t);
        uint64_t contextEndAddressHigh = contextEndAddress + sizeof(uint32_t);
        validateTimestampRegisters<FamilyType>(cmdList,
                                               startCmdList,
                                               REG_GLOBAL_TIMESTAMP_UN, globalEndAddressHigh,
                                               0x23AC, contextEndAddressHigh,
                                               true);
    }
}
} // namespace ult } // namespace ult
} // namespace L0 } // namespace L0

View File

@@ -210,9 +210,9 @@ HWTEST_F(CommandListAppendWaitOnEvent, WhenAppendingWaitOnTimestampEventWithThre
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device)); auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
event->setPacketsInUse(3u); event->setPacketsInUse(3u);
event->kernelCount = 2; event->increaseKernelCount();
event->setPacketsInUse(3u); event->setPacketsInUse(3u);
event->kernelCount = 3; event->increaseKernelCount();
event->setPacketsInUse(3u); event->setPacketsInUse(3u);
ASSERT_EQ(9u, event->getPacketsInUse()); ASSERT_EQ(9u, event->getPacketsInUse());

View File

@@ -6,11 +6,13 @@
*/ */
#include "shared/source/memory_manager/memory_manager.h" #include "shared/source/memory_manager/memory_manager.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/mocks/mock_graphics_allocation.h"
#include "shared/test/common/test_macros/test.h" #include "shared/test/common/test_macros/test.h"
#include "level_zero/core/source/builtin/builtin_functions_lib_impl.h" #include "level_zero/core/source/builtin/builtin_functions_lib_impl.h"
#include "level_zero/core/source/kernel/kernel_imp.h" #include "level_zero/core/source/kernel/kernel_imp.h"
#include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_built_ins.h" #include "level_zero/core/test/unit_tests/mocks/mock_built_ins.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
@@ -210,5 +212,217 @@ HWTEST2_F(AppendFillTest,
delete[] nonMultipleDstPtr; delete[] nonMultipleDstPtr;
} }
// Test matcher selecting GFX cores within the IGFX_GEN9_CORE .. IGFX_GEN12LP_CORE range;
// used by the register-profiling fill tests that follow.
using IsBetweenGen9AndGen12lp = IsWithinGfxCore<IGFX_GEN9_CORE, IGFX_GEN12LP_CORE>;
// Immediate-pattern memory fill signalling a timestamp event: two walkers are expected,
// yet the event must report one packet and one kernel, with start timestamps captured from
// the beginning of the stream and end timestamps from the second walker onward.
HWTEST2_F(AppendFillTest,
          givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesRegistersThenSinglePacketUsesRegisterProfiling, IsBetweenGen9AndGen12lp) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;

    // Timestamp-capable pool holding exactly one event.
    ze_event_pool_desc_t poolDesc = {};
    poolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    poolDesc.count = 1;

    ze_event_desc_t signalDesc = {};
    signalDesc.index = 0;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto timestampPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &poolDesc, result));
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    auto timestampEvent = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(timestampPool.get(), &signalDesc, device));

    // Expected destinations of the four timestamp stores.
    const uint64_t eventBase = timestampEvent->getGpuAddress(device);
    const uint64_t globalStartAddress = eventBase + timestampEvent->getGlobalStartOffset();
    const uint64_t contextStartAddress = eventBase + timestampEvent->getContextStartOffset();
    const uint64_t globalEndAddress = eventBase + timestampEvent->getGlobalEndOffset();
    const uint64_t contextEndAddress = eventBase + timestampEvent->getContextEndOffset();

    auto fillCmdList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
    fillCmdList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    result = fillCmdList->appendMemoryFill(immediateDstPtr, &immediatePattern,
                                           sizeof(immediatePattern),
                                           immediateAllocSize, timestampEvent->toHandle(), 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(1u, timestampEvent->getPacketsInUse());
    EXPECT_EQ(1u, timestampEvent->getKernelCount());

    auto *stream = fillCmdList->commandContainer.getCommandStream();
    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands, ptrOffset(stream->getCpuBase(), 0),
        stream->getUsed()));

    auto walkers = findAll<GPGPU_WALKER *>(parsedCommands.begin(), parsedCommands.end());
    auto streamStart = parsedCommands.begin();
    ASSERT_EQ(2u, walkers.size());
    auto lastWalker = walkers[1];

    // Start timestamps: searched from the beginning of the stream.
    validateTimestampRegisters<FamilyType>(parsedCommands,
                                           streamStart,
                                           REG_GLOBAL_TIMESTAMP_LDW, globalStartAddress,
                                           GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddress,
                                           false);

    // End timestamps: searched from the second walker onward.
    validateTimestampRegisters<FamilyType>(parsedCommands,
                                           lastWalker,
                                           REG_GLOBAL_TIMESTAMP_LDW, globalEndAddress,
                                           GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddress,
                                           false);
}
// Pattern-buffer memory fill signalling a timestamp event: two walkers are expected, yet
// the event must report one packet and one kernel, with start timestamps captured from the
// beginning of the stream and end timestamps from the second walker onward.
HWTEST2_F(AppendFillTest,
          givenCallToAppendMemoryFillWhenTimestampEventUsesRegistersThenSinglePacketUsesRegisterProfiling, IsBetweenGen9AndGen12lp) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;

    // Timestamp-capable pool holding exactly one event.
    ze_event_pool_desc_t poolDesc = {};
    poolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    poolDesc.count = 1;

    ze_event_desc_t signalDesc = {};
    signalDesc.index = 0;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto timestampPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &poolDesc, result));
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    auto timestampEvent = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(timestampPool.get(), &signalDesc, device));

    // Expected destinations of the four timestamp stores.
    const uint64_t eventBase = timestampEvent->getGpuAddress(device);
    const uint64_t globalStartAddress = eventBase + timestampEvent->getGlobalStartOffset();
    const uint64_t contextStartAddress = eventBase + timestampEvent->getContextStartOffset();
    const uint64_t globalEndAddress = eventBase + timestampEvent->getGlobalEndOffset();
    const uint64_t contextEndAddress = eventBase + timestampEvent->getContextEndOffset();

    auto fillCmdList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
    fillCmdList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    result = fillCmdList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, timestampEvent->toHandle(), 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(1u, timestampEvent->getPacketsInUse());
    EXPECT_EQ(1u, timestampEvent->getKernelCount());

    auto *stream = fillCmdList->commandContainer.getCommandStream();
    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands, ptrOffset(stream->getCpuBase(), 0),
        stream->getUsed()));

    auto walkers = findAll<GPGPU_WALKER *>(parsedCommands.begin(), parsedCommands.end());
    auto streamStart = parsedCommands.begin();
    ASSERT_EQ(2u, walkers.size());
    auto lastWalker = walkers[1];

    // Start timestamps: searched from the beginning of the stream.
    validateTimestampRegisters<FamilyType>(parsedCommands,
                                           streamStart,
                                           REG_GLOBAL_TIMESTAMP_LDW, globalStartAddress,
                                           GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddress,
                                           false);

    // End timestamps: searched from the second walker onward.
    validateTimestampRegisters<FamilyType>(parsedCommands,
                                           lastWalker,
                                           REG_GLOBAL_TIMESTAMP_LDW, globalEndAddress,
                                           GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddress,
                                           false);
}
// Immediate-pattern memory fill signalling a timestamp event on platforms with
// COMPUTE_WALKER: each of the two walkers is expected to carry its own timestamp post-sync,
// targeting consecutive event packets, and the event must report two packets / two kernels.
HWTEST2_F(AppendFillTest,
          givenCallToAppendMemoryFillWithImmediateValueWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, IsAtLeastXeHpCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
    using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;

    // Timestamp-capable pool holding exactly one event.
    ze_event_pool_desc_t poolDesc = {};
    poolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    poolDesc.count = 1;

    ze_event_desc_t signalDesc = {};
    signalDesc.index = 0;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto timestampPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &poolDesc, result));
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    auto timestampEvent = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(timestampPool.get(), &signalDesc, device));

    // Packet addresses expected as post-sync destinations of the first and second walker.
    const uint64_t eventBase = timestampEvent->getGpuAddress(device);
    const uint64_t firstKernelEventAddress = eventBase;
    const uint64_t secondKernelEventAddress = eventBase + timestampEvent->getSinglePacketSize();

    auto fillCmdList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
    fillCmdList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    result = fillCmdList->appendMemoryFill(immediateDstPtr, &immediatePattern,
                                           sizeof(immediatePattern),
                                           immediateAllocSize, timestampEvent->toHandle(), 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(2u, timestampEvent->getPacketsInUse());
    EXPECT_EQ(2u, timestampEvent->getKernelCount());

    auto *stream = fillCmdList->commandContainer.getCommandStream();
    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands, ptrOffset(stream->getCpuBase(), 0),
        stream->getUsed()));

    auto walkers = findAll<COMPUTE_WALKER *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(2u, walkers.size());

    auto firstWalkerCmd = genCmdCast<COMPUTE_WALKER *>(*walkers[0]);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, firstWalkerCmd->getPostSync().getOperation());
    EXPECT_EQ(firstKernelEventAddress, firstWalkerCmd->getPostSync().getDestinationAddress());

    auto secondWalkerCmd = genCmdCast<COMPUTE_WALKER *>(*walkers[1]);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, secondWalkerCmd->getPostSync().getOperation());
    EXPECT_EQ(secondKernelEventAddress, secondWalkerCmd->getPostSync().getDestinationAddress());
}
// Checks that appendMemoryFill with the fixture's pattern buffer and a timestamp event
// produces two COMPUTE_WALKER commands, each programmed with a timestamp post-sync write
// that targets its own packet of the same event (base address, then base + one packet size).
HWTEST2_F(AppendFillTest,
          givenCallToAppendMemoryFillWhenTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfiling, IsAtLeastXeHpCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
    using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    // Fatal assertions: the pool and the event are dereferenced below, so a failed
    // creation must stop the test instead of crashing it.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    ASSERT_NE(nullptr, eventPool);
    auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    ASSERT_NE(nullptr, event);

    // Expected post-sync destinations are captured before the append call.
    uint64_t firstKernelEventAddress = event->getGpuAddress(device);
    uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize();

    auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
    commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(2u, event->getPacketsInUse());
    EXPECT_EQ(2u, event->getKernelCount());

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0),
        commandList->commandContainer.getCommandStream()->getUsed()));

    auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
    // Fatal: both walkers are indexed right after this check.
    ASSERT_EQ(2u, itorWalkers.size());
    auto firstWalker = itorWalkers[0];
    auto secondWalker = itorWalkers[1];

    auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
}
} // namespace ult } // namespace ult
} // namespace L0 } // namespace L0

View File

@@ -278,5 +278,295 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenVariousKernelsAndPatchingDisallowe
pCommandList->reset(); pCommandList->reset();
} }
// Fixture alias for the AppendMemoryCopyXeHpAndLater test cases; it is the plain
// DeviceFixture wrapped in Test<>.
using AppendMemoryCopyXeHpAndLater = Test<DeviceFixture>;
// Checks that a memory copy with a timestamp event is split into three kernel launches
// (three COMPUTE_WALKER commands), each with a timestamp post-sync write that targets
// consecutive packets of the same event: base, base + 1 packet and base + 2 packets.
HWTEST2_F(AppendMemoryCopyXeHpAndLater,
          givenCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernels,
          IsAtLeastXeHpCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
    using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;

    MockAppendMemoryCopy<gfxCoreFamily> commandList;
    commandList.appendMemoryCopyKernelWithGACallBase = true;
    commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    void *srcPtr = reinterpret_cast<void *>(0x1231);
    void *dstPtr = reinterpret_cast<void *>(0x200002345);

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    // Fatal assertions: the pool and the event are dereferenced below, so a failed
    // creation must stop the test instead of crashing it.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    ASSERT_NE(nullptr, eventPool);
    auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    ASSERT_NE(nullptr, event);

    // Expected post-sync destinations are captured before the append call.
    uint64_t firstKernelEventAddress = event->getGpuAddress(device);
    uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize();
    uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize();

    commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr);
    EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled);
    EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled);
    EXPECT_EQ(3u, event->getPacketsInUse());
    EXPECT_EQ(3u, event->getKernelCount());

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0),
        commandList.commandContainer.getCommandStream()->getUsed()));

    auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
    // Fatal: all three walkers are indexed right after this check.
    ASSERT_EQ(3u, itorWalkers.size());
    auto firstWalker = itorWalkers[0];
    auto secondWalker = itorWalkers[1];
    auto thirdWalker = itorWalkers[2];

    auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*thirdWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
}
// Same scenario as the single-tile memory-copy timestamp test, but with the command list's
// partitionCount set to 2. The test expects three COMPUTE_WALKER commands whose timestamp
// post-sync destinations are two packets apart (base, base + 2, base + 4 packets) and six
// packets in use on the event.
HWTEST2_F(AppendMemoryCopyXeHpAndLater,
          givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernels,
          IsAtLeastXeHpCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
    using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;

    MockAppendMemoryCopy<gfxCoreFamily> commandList;
    commandList.appendMemoryCopyKernelWithGACallBase = true;
    // Two partitions, assigned before initialize() exactly as in the original test.
    commandList.partitionCount = 2;
    commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    void *srcPtr = reinterpret_cast<void *>(0x1231);
    void *dstPtr = reinterpret_cast<void *>(0x200002345);

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    // Fatal assertions: the pool and the event are dereferenced below, so a failed
    // creation must stop the test instead of crashing it.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    ASSERT_NE(nullptr, eventPool);
    auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    ASSERT_NE(nullptr, event);

    // Expected post-sync destinations are captured before the append call.
    uint64_t firstKernelEventAddress = event->getGpuAddress(device);
    uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize();
    uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 4 * event->getSinglePacketSize();

    commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr);
    EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled);
    EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled);
    EXPECT_EQ(6u, event->getPacketsInUse());
    EXPECT_EQ(3u, event->getKernelCount());

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0),
        commandList.commandContainer.getCommandStream()->getUsed()));

    auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
    // Fatal: all three walkers are indexed right after this check.
    ASSERT_EQ(3u, itorWalkers.size());
    auto firstWalker = itorWalkers[0];
    auto secondWalker = itorWalkers[1];
    auto thirdWalker = itorWalkers[2];

    auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*thirdWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
}
// Memory copy with a timestamp event that has a host signal scope, single partition.
// The test expects:
//  - three COMPUTE_WALKER commands with timestamp post-sync writes two packets apart
//    (base, base + 2, base + 4 packets) and six packets in use on the event;
//  - three PIPE_CONTROL commands with a write-immediate-data post-sync, each signalling the
//    packet that follows the matching walker's packet (offset by the context-end offset
//    when the event uses it), with CS stall and DC flush enabled and the workload
//    partition offset disabled.
// Per the test name these extra PIPE_CONTROLs belong to the L3 flush workaround path.
HWTEST2_F(AppendMemoryCopyXeHpAndLater,
          givenCommandListAndEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernelsAndL3FlushWaHandled,
          isXeHpOrXeHpgCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
    using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;

    MockAppendMemoryCopy<gfxCoreFamily> commandList;
    commandList.appendMemoryCopyKernelWithGACallBase = true;
    commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    void *srcPtr = reinterpret_cast<void *>(0x1231);
    void *dstPtr = reinterpret_cast<void *>(0x200002345);

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
    eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    // Fatal assertions: the pool and the event are dereferenced below, so a failed
    // creation must stop the test instead of crashing it.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    ASSERT_NE(nullptr, eventPool);
    auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    ASSERT_NE(nullptr, event);

    // Expected walker post-sync destinations are captured before the append call.
    uint64_t firstKernelEventAddress = event->getGpuAddress(device);
    uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize();
    uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 4 * event->getSinglePacketSize();

    commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr);
    EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled);
    EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled);
    EXPECT_EQ(6u, event->getPacketsInUse());
    EXPECT_EQ(3u, event->getKernelCount());

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0),
        commandList.commandContainer.getCommandStream()->getUsed()));

    auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
    // Fatal: all three walkers are indexed right after this check.
    ASSERT_EQ(3u, itorWalkers.size());
    auto firstWalker = itorWalkers[0];
    auto secondWalker = itorWalkers[1];
    auto thirdWalker = itorWalkers[2];

    auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*thirdWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    auto itorPipeControls = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());

    // First expected PIPE_CONTROL destination: the packet right after the first walker's packet.
    uint64_t eventGpuAddress = firstKernelEventAddress + event->getSinglePacketSize();
    if (event->isUsingContextEndOffset()) {
        eventGpuAddress += event->getContextEndOffset();
    }

    uint32_t postSyncPipeControls = 0;
    for (const auto &it : itorPipeControls) {
        auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
        // Only PIPE_CONTROLs that write immediate data are counted and validated.
        if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
            EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
            EXPECT_TRUE(cmd->getCommandStreamerStallEnable());
            EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
            EXPECT_TRUE(cmd->getDcFlushEnable());
            EXPECT_EQ(eventGpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
            postSyncPipeControls++;
            // The next kernel's pair of packets starts two packets further.
            eventGpuAddress += (2 * event->getSinglePacketSize());
        }
    }
    EXPECT_EQ(3u, postSyncPipeControls);
}
// Memory copy with a timestamp event that has a host signal scope, with the command list's
// partitionCount set to 2. The test expects:
//  - three COMPUTE_WALKER commands with timestamp post-sync writes four packets apart
//    (base, base + 4, base + 8 packets) and twelve packets in use on the event;
//  - three PIPE_CONTROL commands with a write-immediate-data post-sync, each targeting the
//    address two packets after the matching walker's destination (offset by the context-end
//    offset when the event uses it), with CS stall, DC flush and the workload partition
//    offset all enabled.
// Per the test name these extra PIPE_CONTROLs belong to the L3 flush workaround path.
HWTEST2_F(AppendMemoryCopyXeHpAndLater,
          givenMultiTileCommandListAndEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernelsAndL3FlushWaHandled,
          isXeHpOrXeHpgCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
    using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;

    MockAppendMemoryCopy<gfxCoreFamily> commandList;
    commandList.appendMemoryCopyKernelWithGACallBase = true;
    // Two partitions, assigned before initialize() exactly as in the original test.
    commandList.partitionCount = 2;
    commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    void *srcPtr = reinterpret_cast<void *>(0x1231);
    void *dstPtr = reinterpret_cast<void *>(0x200002345);

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
    eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    // Fatal assertions: the pool and the event are dereferenced below, so a failed
    // creation must stop the test instead of crashing it.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    ASSERT_NE(nullptr, eventPool);
    auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    ASSERT_NE(nullptr, event);

    // Expected walker post-sync destinations are captured before the append call.
    uint64_t firstKernelEventAddress = event->getGpuAddress(device);
    uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 4 * event->getSinglePacketSize();
    uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 8 * event->getSinglePacketSize();

    commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr);
    EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled);
    EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled);
    EXPECT_EQ(12u, event->getPacketsInUse());
    EXPECT_EQ(3u, event->getKernelCount());

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0),
        commandList.commandContainer.getCommandStream()->getUsed()));

    auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
    // Fatal: all three walkers are indexed right after this check.
    ASSERT_EQ(3u, itorWalkers.size());
    auto firstWalker = itorWalkers[0];
    auto secondWalker = itorWalkers[1];
    auto thirdWalker = itorWalkers[2];

    auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    walkerCmd = genCmdCast<COMPUTE_WALKER *>(*thirdWalker);
    EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
    EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());

    auto itorPipeControls = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());

    // First expected PIPE_CONTROL destination: two packets after the first walker's destination.
    uint64_t eventGpuAddress = firstKernelEventAddress + 2 * event->getSinglePacketSize();
    if (event->isUsingContextEndOffset()) {
        eventGpuAddress += event->getContextEndOffset();
    }

    uint32_t postSyncPipeControls = 0;
    for (const auto &it : itorPipeControls) {
        auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
        // Only PIPE_CONTROLs that write immediate data are counted and validated.
        if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
            EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
            EXPECT_TRUE(cmd->getCommandStreamerStallEnable());
            EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
            EXPECT_TRUE(cmd->getDcFlushEnable());
            EXPECT_EQ(eventGpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
            postSyncPipeControls++;
            // The next kernel's group of packets starts four packets further.
            eventGpuAddress += (4 * event->getSinglePacketSize());
        }
    }
    EXPECT_EQ(3u, postSyncPipeControls);
}
} // namespace ult } // namespace ult
} // namespace L0 } // namespace L0

View File

@@ -551,15 +551,27 @@ TEST_F(EventCreate, givenEventWhenSignaledAndResetFromTheHostThenCorrectDataAndO
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(ZE_RESULT_SUCCESS, result);
ASSERT_NE(nullptr, eventPool); ASSERT_NE(nullptr, eventPool);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
auto &l0HwHelper = L0HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily);
auto event = std::unique_ptr<L0::Event>(l0HwHelper.createEvent(eventPool.get(), &eventDesc, device));
ASSERT_NE(nullptr, event); ASSERT_NE(nullptr, event);
if (L0HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily).multiTileCapablePlatform()) { if (l0HwHelper.multiTileCapablePlatform()) {
EXPECT_TRUE(event->isUsingContextEndOffset()); EXPECT_TRUE(event->isUsingContextEndOffset());
} else { } else {
EXPECT_FALSE(event->isUsingContextEndOffset()); EXPECT_FALSE(event->isUsingContextEndOffset());
} }
uint32_t *eventCompletionMemory = reinterpret_cast<uint32_t *>(event->getHostAddress());
if (event->isUsingContextEndOffset()) {
eventCompletionMemory = ptrOffset(eventCompletionMemory, event->getContextEndOffset());
}
uint32_t maxPacketsCount = EventPacketsCount::maxKernelSplit * NEO::TimestampPacketSizeControl::preferredPacketCount;
for (uint32_t i = 0; i < maxPacketsCount; i++) {
EXPECT_EQ(Event::STATE_INITIAL, *eventCompletionMemory);
eventCompletionMemory = ptrOffset(eventCompletionMemory, event->getSinglePacketSize());
}
result = event->queryStatus(); result = event->queryStatus();
EXPECT_EQ(ZE_RESULT_NOT_READY, result); EXPECT_EQ(ZE_RESULT_NOT_READY, result);
@@ -1064,7 +1076,7 @@ TEST_F(TimestampEventCreate, givenEventTimestampsCreatedWhenResetIsInvokeThenCor
EXPECT_EQ(1u, event->kernelEventCompletionData[j].getPacketsUsed()); EXPECT_EQ(1u, event->kernelEventCompletionData[j].getPacketsUsed());
} }
EXPECT_EQ(1u, event->kernelCount); EXPECT_EQ(1u, event->getKernelCount());
} }
TEST_F(TimestampEventCreate, givenSingleTimestampEventThenAllocationSizeCreatedForAllTimestamps) { TEST_F(TimestampEventCreate, givenSingleTimestampEventThenAllocationSizeCreatedForAllTimestamps) {
@@ -1093,13 +1105,13 @@ TEST_F(TimestampEventCreate, givenEventTimestampWhenPacketCountIsSetThenCorrectO
gpuAddr += (4u * event->getSinglePacketSize()); gpuAddr += (4u * event->getSinglePacketSize());
event->kernelCount = 2; event->increaseKernelCount();
event->setPacketsInUse(2u); event->setPacketsInUse(2u);
EXPECT_EQ(6u, event->getPacketsInUse()); EXPECT_EQ(6u, event->getPacketsInUse());
EXPECT_EQ(gpuAddr, event->getPacketAddress(device)); EXPECT_EQ(gpuAddr, event->getPacketAddress(device));
gpuAddr += (2u * event->getSinglePacketSize()); gpuAddr += (2u * event->getSinglePacketSize());
event->kernelCount = 3; event->increaseKernelCount();
EXPECT_EQ(gpuAddr, event->getPacketAddress(device)); EXPECT_EQ(gpuAddr, event->getPacketAddress(device));
EXPECT_EQ(7u, event->getPacketsInUse()); EXPECT_EQ(7u, event->getPacketsInUse());
} }
@@ -1122,7 +1134,7 @@ TEST_F(TimestampEventCreate, givenEventWhenSignaledAndResetFromTheHostThenCorrec
} }
EXPECT_EQ(1u, event->kernelEventCompletionData[j].getPacketsUsed()); EXPECT_EQ(1u, event->kernelEventCompletionData[j].getPacketsUsed());
} }
EXPECT_EQ(1u, event->kernelCount); EXPECT_EQ(1u, event->getKernelCount());
} }
TEST_F(TimestampEventCreate, givenpCountZeroCallingQueryTimestampExpThenpCountSetProperly) { TEST_F(TimestampEventCreate, givenpCountZeroCallingQueryTimestampExpThenpCountSetProperly) {

View File

@@ -385,6 +385,42 @@ HWTEST_F(PipeControlHelperTests, WhenIsDcFlushAllowedIsCalledThenCorrectResultIs
EXPECT_EQ(hwInfoConfig.isDcFlushAllowed(), MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, *defaultHwInfo)); EXPECT_EQ(hwInfoConfig.isDcFlushAllowed(), MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, *defaultHwInfo));
} }
// Programs a PIPE_CONTROL with a write-timestamp post-sync operation into a scratch stream
// and checks that the address, immediate data and post-sync operation read back from the
// command match what was requested.
HWTEST_F(PipeControlHelperTests, WhenPipeControlPostSyncTimestampUsedThenCorrectPostSyncUsed) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    // The array form of unique_ptr is required so the storage is released with delete[];
    // unique_ptr<uint8_t> over a new[] allocation is undefined behaviour on destruction.
    constexpr size_t bufferSize = 128;
    auto buffer = std::make_unique<uint8_t[]>(bufferSize);
    LinearStream stream(buffer.get(), bufferSize);

    uint64_t address = 0x1234567887654320;
    uint64_t immediateData = 0x0;

    PipeControlArgs args;
    MemorySynchronizationCommands<FamilyType>::addPipeControlWithPostSync(
        stream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, immediateData, args);

    // The command under test is looked up at the very start of the stream.
    auto pipeControl = genCmdCast<PIPE_CONTROL *>(stream.getCpuBase());
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_EQ(address, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
    EXPECT_EQ(immediateData, pipeControl->getImmediateData());
    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, pipeControl->getPostSyncOperation());
}
// Programs a PIPE_CONTROL with a write-immediate-data post-sync operation into a scratch
// stream and checks that the address, immediate data and post-sync operation read back from
// the command match what was requested.
HWTEST_F(PipeControlHelperTests, WhenPipeControlPostSyncWriteImmediateDataUsedThenCorrectPostSyncUsed) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    // The array form of unique_ptr is required so the storage is released with delete[];
    // unique_ptr<uint8_t> over a new[] allocation is undefined behaviour on destruction.
    constexpr size_t bufferSize = 128;
    auto buffer = std::make_unique<uint8_t[]>(bufferSize);
    LinearStream stream(buffer.get(), bufferSize);

    uint64_t address = 0x1234567887654320;
    uint64_t immediateData = 0x1234;

    PipeControlArgs args;
    MemorySynchronizationCommands<FamilyType>::addPipeControlWithPostSync(
        stream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, address, immediateData, args);

    // The command under test is looked up at the very start of the stream.
    auto pipeControl = genCmdCast<PIPE_CONTROL *>(stream.getCpuBase());
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_EQ(address, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
    EXPECT_EQ(immediateData, pipeControl->getImmediateData());
    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation());
}
TEST(HwInfoTest, givenHwInfoWhenChosenEngineTypeQueriedThenDefaultIsReturned) { TEST(HwInfoTest, givenHwInfoWhenChosenEngineTypeQueriedThenDefaultIsReturned) {
HardwareInfo hwInfo = *defaultHwInfo; HardwareInfo hwInfo = *defaultHwInfo;
hwInfo.capabilityTable.defaultEngineType = aub_stream::ENGINE_RCS; hwInfo.capabilityTable.defaultEngineType = aub_stream::ENGINE_RCS;

View File

@@ -75,6 +75,10 @@ struct UnitTestHelper {
static void adjustKernelDescriptorForImplicitArgs(KernelDescriptor &kernelDescriptor); static void adjustKernelDescriptorForImplicitArgs(KernelDescriptor &kernelDescriptor);
static std::vector<bool> getProgrammedLargeGrfValues(CommandStreamReceiver &csr, LinearStream &linearStream); static std::vector<bool> getProgrammedLargeGrfValues(CommandStreamReceiver &csr, LinearStream &linearStream);
static bool getWorkloadPartitionForStoreRegisterMemCmd(typename GfxFamily::MI_STORE_REGISTER_MEM &storeRegisterMem);
static bool timestampRegisterHighAddress();
}; };
} // namespace NEO } // namespace NEO

View File

@@ -70,4 +70,9 @@ inline uint64_t UnitTestHelper<GfxFamily>::getPipeControlPostSyncAddress(const t
return (gpuAddressHigh << 32) | gpuAddress; return (gpuAddressHigh << 32) | gpuAddress;
} }
// Generic definition of UnitTestHelper::timestampRegisterHighAddress for this family group:
// the answer is unconditionally false.
template <typename GfxFamily>
bool UnitTestHelper<GfxFamily>::timestampRegisterHighAddress() {
    constexpr bool usesHighAddress = false;
    return usesHighAddress;
}
} // namespace NEO } // namespace NEO

View File

@@ -72,4 +72,9 @@ std::vector<bool> UnitTestHelper<GfxFamily>::getProgrammedLargeGrfValues(Command
return {}; return {};
} }
// Definition of UnitTestHelper::getWorkloadPartitionForStoreRegisterMemCmd that does not
// inspect the MI_STORE_REGISTER_MEM command at all and always reports false.
template <typename GfxFamily>
inline bool UnitTestHelper<GfxFamily>::getWorkloadPartitionForStoreRegisterMemCmd(typename GfxFamily::MI_STORE_REGISTER_MEM &storeRegisterMem) {
    constexpr bool partitionOffsetEnabled = false;
    return partitionOffsetEnabled;
}
} // namespace NEO } // namespace NEO

View File

@@ -100,4 +100,9 @@ std::vector<bool> UnitTestHelper<GfxFamily>::getProgrammedLargeGrfValues(Command
return largeGrfValues; return largeGrfValues;
} }
// Definition of UnitTestHelper::getWorkloadPartitionForStoreRegisterMemCmd that forwards
// the value of the command's own getWorkloadPartitionIdOffsetEnable() accessor.
template <typename GfxFamily>
inline bool UnitTestHelper<GfxFamily>::getWorkloadPartitionForStoreRegisterMemCmd(typename GfxFamily::MI_STORE_REGISTER_MEM &storeRegisterMem) {
    const bool partitionOffsetEnabled = storeRegisterMem.getWorkloadPartitionIdOffsetEnable();
    return partitionOffsetEnabled;
}
} // namespace NEO } // namespace NEO

View File

@@ -34,6 +34,7 @@ using IsAtMostXeHpgCore = IsAtMostGfxCore<IGFX_XE_HPG_CORE>;
using IsAtLeastXeHpcCore = IsAtLeastGfxCore<IGFX_XE_HPC_CORE>; using IsAtLeastXeHpcCore = IsAtLeastGfxCore<IGFX_XE_HPC_CORE>;
using IsAtMostXeHpcCore = IsAtMostGfxCore<IGFX_XE_HPC_CORE>; using IsAtMostXeHpcCore = IsAtMostGfxCore<IGFX_XE_HPC_CORE>;
// Matcher alias combining IGFX_XE_HP_CORE and IGFX_XE_HPG_CORE through IsAnyGfxCores.
using isXeHpOrXeHpgCore = IsAnyGfxCores<IGFX_XE_HP_CORE, IGFX_XE_HPG_CORE>;
using isXeHpOrXeHpcCore = IsAnyGfxCores<IGFX_XE_HP_CORE, IGFX_XE_HPC_CORE>; using isXeHpOrXeHpcCore = IsAnyGfxCores<IGFX_XE_HP_CORE, IGFX_XE_HPC_CORE>;
using isXeHpcOrXeHpgCore = IsAnyGfxCores<IGFX_XE_HPC_CORE, IGFX_XE_HPG_CORE>; using isXeHpcOrXeHpgCore = IsAnyGfxCores<IGFX_XE_HPC_CORE, IGFX_XE_HPG_CORE>;