[L0][XE_HPC] Perform memcpy on CPU for non-usm ptrs

Related-To: NEO-7237

If the copy size is small enough, it is more efficient to perform the copy on the CPU through a locked pointer. This change also introduces an experimental flag to enable this, along with per-direction threshold overrides.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
parent 6c1504a0f4
commit ec04de61a7
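For context, here is a minimal sketch (not part of this commit) of the user-visible scenario the change targets: a small host-to-device copy on an immediate command list, where the source is a plain heap pointer (non-USM) and the destination is device USM. Error checking is omitted and the sizes are illustrative; with ExperimentalCopyThroughLock enabled and the size below the H2D threshold, the driver may service such a copy with a CPU memcpy through a locked pointer instead of submitting GPU work.

#include <level_zero/ze_api.h>
#include <vector>

int main() {
    zeInit(ZE_INIT_FLAG_GPU_ONLY);

    uint32_t count = 1;
    ze_driver_handle_t driver;
    zeDriverGet(&count, &driver);

    count = 1;
    ze_device_handle_t device;
    zeDeviceGet(driver, &count, &device);

    ze_context_desc_t contextDesc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC};
    ze_context_handle_t context;
    zeContextCreate(driver, &contextDesc, &context);

    // Immediate command list: appended copies are submitted right away.
    ze_command_queue_desc_t queueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    ze_command_list_handle_t cmdList;
    zeCommandListCreateImmediate(context, device, &queueDesc, &cmdList);

    // Device USM destination, non-USM (plain heap) host source.
    ze_device_mem_alloc_desc_t deviceDesc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
    void *devicePtr = nullptr;
    zeMemAllocDevice(context, &deviceDesc, 4096, 4096, device, &devicePtr);
    std::vector<char> hostData(4096, 1);

    // 4 KB is well below the default 2 MB H2D threshold, so with the experimental
    // flag enabled this copy is a candidate for the locked-pointer CPU path.
    zeCommandListAppendMemoryCopy(cmdList, devicePtr, hostData.data(), hostData.size(),
                                  nullptr, 0, nullptr);

    zeMemFree(context, devicePtr);
    zeCommandListDestroy(cmdList);
    zeContextDestroy(context);
    return 0;
}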
@@ -323,6 +323,9 @@ struct CommandList : _ze_command_list_handle_t {
    bool multiReturnPointCommandList = false;
    bool systolicModeSupport = false;
    bool pipelineSelectStateTracking = false;

    std::atomic<uint32_t> barrierCounter{0u};
    uint32_t latestFlushedBarrierCounter = 0u;
};

using CommandListAllocatorFn = CommandList *(*)(uint32_t);
@@ -2492,7 +2492,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
    }

    appendSignalEventPostWalker(signalEvent, workloadPartition);

    this->barrierCounter++;
    return ZE_RESULT_SUCCESS;
}
@@ -9,6 +9,10 @@

#include "level_zero/core/source/cmdlist/cmdlist_hw.h"

namespace NEO {
struct SvmAllocationData;
}

namespace L0 {

struct EventPool;
@@ -123,6 +127,11 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami

    void createLogicalStateHelper() override {}
    NEO::LogicalStateHelper *getLogicalStateHelper() const override;

    bool preferCopyThroughLockedPtr(NEO::SvmAllocationData *dstAlloc, bool dstFound, NEO::SvmAllocationData *srcAlloc, bool srcFound, size_t size);
    bool isAllocUSMDeviceMemory(NEO::SvmAllocationData *alloc, bool allocFound);
    ze_result_t performCpuMemcpy(void *dstptr, const void *srcptr, size_t size, bool isDstDeviceMemory, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
    void *obtainLockedPtrFromDevice(void *ptr, size_t size);
};

template <PRODUCT_FAMILY gfxProductFamily>
@@ -13,6 +13,7 @@
#include "shared/source/helpers/logical_state_helper.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/prefetch_manager.h"
#include "shared/source/memory_manager/unified_memory_manager.h"

#include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h"
#include "level_zero/core/source/device/bcs_split.h"
@@ -227,6 +228,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(

    ze_result_t ret;

    NEO::SvmAllocationData *srcAllocData = nullptr;
    NEO::SvmAllocationData *dstAllocData = nullptr;
    bool srcAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &srcAllocData);
    bool dstAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(dstptr, size, &dstAllocData);
    if (preferCopyThroughLockedPtr(dstAllocData, dstAllocFound, srcAllocData, srcAllocFound, size)) {
        return performCpuMemcpy(dstptr, srcptr, size, dstAllocFound, hSignalEvent, numWaitEvents, phWaitEvents);
    }

    if (this->isAppendSplitNeeded(dstptr, srcptr, size)) {
        ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
            return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents);
@@ -461,4 +470,91 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
    return inputRet;
}

template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(NEO::SvmAllocationData *dstAlloc, bool dstFound, NEO::SvmAllocationData *srcAlloc, bool srcFound, size_t size) {
    size_t h2DThreshold = 2 * MemoryConstants::megaByte;
    size_t d2HThreshold = 1 * MemoryConstants::kiloByte;
    if (NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get() != -1) {
        h2DThreshold = NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get();
    }
    if (NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get() != -1) {
        d2HThreshold = NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get();
    }
    if (NEO::HwHelper::get(this->device->getHwInfo().platform.eRenderCoreFamily).copyThroughLockedPtrEnabled()) {
        return (!srcFound && isAllocUSMDeviceMemory(dstAlloc, dstFound) && size <= h2DThreshold) ||
               (!dstFound && isAllocUSMDeviceMemory(srcAlloc, srcFound) && size <= d2HThreshold);
    }
    return false;
}

template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isAllocUSMDeviceMemory(NEO::SvmAllocationData *alloc, bool allocFound) {
    return allocFound && (alloc->memoryType == InternalMemoryType::DEVICE_UNIFIED_MEMORY);
}

template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(void *dstptr, const void *srcptr, size_t size, bool isDstDeviceMemory, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {

    bool needsBarrier = (numWaitEvents > 0);
    if (needsBarrier) {
        this->appendBarrier(nullptr, numWaitEvents, phWaitEvents);
    }

    bool needsFlushTagUpdate = this->latestFlushedBarrierCounter < this->barrierCounter;
    if (needsFlushTagUpdate) {
        this->csr->flushTagUpdate();
    }

    Event *signalEvent = nullptr;
    if (hSignalEvent) {
        signalEvent = Event::fromHandle(hSignalEvent);
    }

    const void *cpuMemcpySrcPtr = nullptr;
    void *cpuMemcpyDstPtr = nullptr;
    if (isDstDeviceMemory) {
        cpuMemcpySrcPtr = srcptr;
        cpuMemcpyDstPtr = obtainLockedPtrFromDevice(dstptr, size);
    } else {
        cpuMemcpySrcPtr = obtainLockedPtrFromDevice(const_cast<void *>(srcptr), size);
        cpuMemcpyDstPtr = dstptr;
    }

    if (needsFlushTagUpdate) {
        auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
        const auto waitStatus = this->csr->waitForCompletionWithTimeout(NEO::WaitParams{false, false, timeoutMicroseconds}, this->csr->peekTaskCount());
        if (waitStatus == NEO::WaitStatus::GpuHang) {
            return ZE_RESULT_ERROR_DEVICE_LOST;
        }
        this->latestFlushedBarrierCounter = this->barrierCounter;
    }

    if (signalEvent) {
        signalEvent->setGpuStartTimestamp();
    }

    memcpy_s(cpuMemcpyDstPtr, size, cpuMemcpySrcPtr, size);

    if (signalEvent) {
        signalEvent->setGpuEndTimestamp();
        signalEvent->hostSignal();
    }
    return ZE_RESULT_SUCCESS;
}

template <GFXCORE_FAMILY gfxCoreFamily>
void *CommandListCoreFamilyImmediate<gfxCoreFamily>::obtainLockedPtrFromDevice(void *ptr, size_t size) {
    NEO::SvmAllocationData *allocData = nullptr;
    auto allocFound = this->device->getDriverHandle()->findAllocationDataForRange(ptr, size, &allocData);
    UNRECOVERABLE_IF(!allocFound);

    auto alloc = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex());
    if (!alloc->isLocked()) {
        this->device->getDriverHandle()->getMemoryManager()->lockResource(alloc);
    }
    auto gpuAddress = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex())->getGpuAddress();
    auto offset = ptrDiff(ptr, gpuAddress);
    return ptrOffset(alloc->getLockedPtr(), offset);
}

} // namespace L0
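As a side note on preferCopyThroughLockedPtr above, here is a compact restatement of the selection policy (illustrative code, not from the commit; the function name and parameters are made up): the CPU path is taken only when the HwHelper reports the feature enabled, the host-side pointer is not a USM allocation at all, the other side is device USM, and the size is at or below the direction-specific threshold, 2 MB host-to-device and 1 KB device-to-host by default, both overridable through the new debug flags.

// Illustrative restatement of the policy above; not part of the commit.
// Thresholds mirror the defaults in the hunk; names here are hypothetical.
#include <cstddef>

constexpr size_t h2dCpuCopyThreshold = 2u * 1024u * 1024u; // host -> device USM
constexpr size_t d2hCpuCopyThreshold = 1u * 1024u;         // device USM -> host

bool prefersCpuCopy(bool featureEnabled,
                    bool srcIsUsm, bool srcIsDeviceUsm,
                    bool dstIsUsm, bool dstIsDeviceUsm,
                    size_t size) {
    if (!featureEnabled) {
        return false;
    }
    // H2D: plain host pointer into device USM, at or below the H2D threshold.
    const bool h2d = !srcIsUsm && dstIsDeviceUsm && size <= h2dCpuCopyThreshold;
    // D2H: device USM into plain host pointer, at or below the smaller D2H threshold.
    const bool d2h = !dstIsUsm && srcIsDeviceUsm && size <= d2hCpuCopyThreshold;
    return h2d || d2h;
}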
@@ -65,6 +65,8 @@ struct Event : _ze_event_handle_t {
    void *getHostAddress() { return hostAddress; }
    virtual void setPacketsInUse(uint32_t value) = 0;
    uint32_t getCurrKernelDataIndex() const { return kernelCount - 1; }
    virtual void setGpuStartTimestamp() = 0;
    virtual void setGpuEndTimestamp() = 0;

    size_t getContextStartOffset() const {
        return contextStartOffset;
@@ -143,6 +145,10 @@ struct Event : _ze_event_handle_t {
    size_t singlePacketSize = 0u;
    size_t eventPoolOffset = 0u;

    size_t cpuStartTimestamp = 0u;
    size_t gpuStartTimestamp = 0u;
    size_t gpuEndTimestamp = 0u;

    uint32_t kernelCount = 1u;

    bool isTimestampEvent = false;
@@ -195,6 +201,8 @@ struct EventImp : public Event {
    uint32_t getPacketsInUse() override;
    uint32_t getPacketsUsedInLastKernel() override;
    void setPacketsInUse(uint32_t value) override;
    void setGpuStartTimestamp() override;
    void setGpuEndTimestamp() override;

    std::unique_ptr<KernelEventCompletionData<TagSizeT>[]> kernelEventCompletionData;
@ -7,6 +7,7 @@
|
|||
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/memory_manager/internal_allocation_storage.h"
|
||||
#include "shared/source/os_interface/os_time.h"
|
||||
|
||||
#include "level_zero/core/source/event/event.h"
|
||||
#include "level_zero/core/source/hw_helpers/l0_hw_helper.h"
|
||||
|
@ -167,18 +168,24 @@ template <typename TagSizeT>
|
|||
ze_result_t EventImp<TagSizeT>::hostEventSetValueTimestamps(TagSizeT eventVal) {
|
||||
|
||||
auto baseAddr = castToUint64(hostAddress);
|
||||
|
||||
auto eventTsSetFunc = [&eventVal](auto tsAddr) {
|
||||
auto eventTsSetFunc = [](auto tsAddr, TagSizeT value) {
|
||||
auto tsptr = reinterpret_cast<void *>(tsAddr);
|
||||
memcpy_s(tsptr, sizeof(TagSizeT), static_cast<void *>(&eventVal), sizeof(TagSizeT));
|
||||
memcpy_s(tsptr, sizeof(TagSizeT), static_cast<void *>(&value), sizeof(TagSizeT));
|
||||
};
|
||||
|
||||
TagSizeT timestampStart = eventVal;
|
||||
TagSizeT timestampEnd = eventVal;
|
||||
if (eventVal == Event::STATE_SIGNALED) {
|
||||
timestampStart = static_cast<TagSizeT>(this->gpuStartTimestamp);
|
||||
timestampEnd = static_cast<TagSizeT>(this->gpuEndTimestamp);
|
||||
}
|
||||
for (uint32_t i = 0; i < kernelCount; i++) {
|
||||
uint32_t packetsToSet = kernelEventCompletionData[i].getPacketsUsed();
|
||||
for (uint32_t j = 0; j < packetsToSet; j++) {
|
||||
eventTsSetFunc(baseAddr + contextStartOffset);
|
||||
eventTsSetFunc(baseAddr + globalStartOffset);
|
||||
eventTsSetFunc(baseAddr + contextEndOffset);
|
||||
eventTsSetFunc(baseAddr + globalEndOffset);
|
||||
eventTsSetFunc(baseAddr + contextStartOffset, timestampStart);
|
||||
eventTsSetFunc(baseAddr + globalStartOffset, timestampStart);
|
||||
eventTsSetFunc(baseAddr + contextEndOffset, timestampEnd);
|
||||
eventTsSetFunc(baseAddr + globalEndOffset, timestampEnd);
|
||||
baseAddr += singlePacketSize;
|
||||
}
|
||||
}
|
||||
|
@ -316,7 +323,6 @@ ze_result_t EventImp<TagSizeT>::queryKernelTimestamp(ze_kernel_timestamp_result_
|
|||
eventTsSetFunc(globalEndTS, result.context.kernelEnd);
|
||||
eventTsSetFunc(globalEndTS, result.global.kernelEnd);
|
||||
}
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -379,6 +385,9 @@ void EventImp<TagSizeT>::resetPackets() {
|
|||
kernelEventCompletionData[i].setPacketsUsed(1);
|
||||
}
|
||||
kernelCount = 1;
|
||||
cpuStartTimestamp = 0;
|
||||
gpuStartTimestamp = 0;
|
||||
gpuEndTimestamp = 0;
|
||||
}
|
||||
|
||||
template <typename TagSizeT>
|
||||
|
@@ -410,4 +419,21 @@ uint64_t EventImp<TagSizeT>::getPacketAddress(Device *device) {
    return address;
}

template <typename TagSizeT>
void EventImp<TagSizeT>::setGpuStartTimestamp() {
    if (isEventTimestampFlagSet()) {
        this->device->getGlobalTimestamps(&cpuStartTimestamp, &gpuStartTimestamp);
        cpuStartTimestamp = cpuStartTimestamp / this->device->getNEODevice()->getDeviceInfo().outProfilingTimerResolution;
    }
}

template <typename TagSizeT>
void EventImp<TagSizeT>::setGpuEndTimestamp() {
    if (isEventTimestampFlagSet()) {
        auto resolution = this->device->getNEODevice()->getDeviceInfo().outProfilingTimerResolution;
        auto cpuEndTimestamp = this->device->getNEODevice()->getOSTime()->getCpuRawTimestamp() / resolution;
        this->gpuEndTimestamp = gpuStartTimestamp + (cpuEndTimestamp - cpuStartTimestamp);
    }
}

} // namespace L0
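A note on the two timestamp setters above: the start is sampled as a synchronized CPU/GPU timestamp pair, the raw CPU values are scaled by outProfilingTimerResolution into GPU timestamp units, and the end timestamp is derived as the start plus the CPU-measured elapsed time in those units. A small worked example with made-up numbers and an assumed nanoseconds-per-tick meaning of the resolution:

// Illustrative arithmetic only; the values and the ns-per-tick interpretation
// of 'resolution' are assumptions, not taken from the commit.
#include <cstdint>

int main() {
    const double resolution = 80.0;          // assumed ns per GPU timestamp tick
    const uint64_t gpuStart = 1000;          // GPU ticks sampled at copy start
    const uint64_t cpuStartRaw = 2'000'000;  // ns sampled at copy start
    const uint64_t cpuEndRaw = 2'040'000;    // ns sampled after the memcpy

    const uint64_t cpuStart = static_cast<uint64_t>(cpuStartRaw / resolution); // 25000 ticks
    const uint64_t cpuEnd = static_cast<uint64_t>(cpuEndRaw / resolution);     // 25500 ticks
    const uint64_t gpuEnd = gpuStart + (cpuEnd - cpuStart);                    // 1500 ticks
    return gpuEnd == 1500 ? 0 : 1;
}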
@@ -67,8 +67,11 @@ struct Mock<EventPool> : public EventPool {

class MockEvent : public ::L0::Event {
  public:
    using ::L0::Event::gpuEndTimestamp;
    using ::L0::Event::gpuStartTimestamp;
    using ::L0::Event::isCompleted;
    using ::L0::Event::l3FlushAppliedOnKernel;

    MockEvent() {
        mockAllocation.reset(new NEO::MockGraphicsAllocation(0,
                                                             NEO::AllocationType::INTERNAL_HOST_MEMORY,
@@ -119,7 +122,8 @@ class MockEvent : public ::L0::Event {
    void resetPackets() override {}
    void setPacketsInUse(uint32_t value) override {}
    uint64_t getPacketAddress(L0::Device *) override { return 0; }

    void setGpuStartTimestamp() override {}
    void setGpuEndTimestamp() override {}
    std::unique_ptr<NEO::GraphicsAllocation> mockAllocation;
};

@@ -2202,6 +2202,9 @@ HWTEST_F(EventTests,
}

struct MockEventCompletion : public EventImp<uint32_t> {
    using EventImp<uint32_t>::gpuStartTimestamp;
    using EventImp<uint32_t>::gpuEndTimestamp;

    MockEventCompletion(L0::EventPool *eventPool, int index, L0::Device *device) : EventImp(eventPool, index, device) {
        auto neoDevice = device->getNEODevice();
        kernelEventCompletionData = std::make_unique<KernelEventCompletionData<uint32_t>[]>(EventPacketsCount::maxKernelSplit);
@@ -2260,5 +2263,14 @@ TEST_F(EventTests, WhenQueryingStatusAfterResetThenAccessMemory) {
    EXPECT_EQ(event->assignKernelEventCompletionDataCounter, 2u);
}

TEST_F(EventTests, WhenResetEventThenZeroCpuTimestamps) {
    auto event = std::make_unique<MockEventCompletion>(eventPool, 1u, device);
    event->gpuStartTimestamp = 10u;
    event->gpuEndTimestamp = 20u;
    EXPECT_EQ(event->reset(), ZE_RESULT_SUCCESS);
    EXPECT_EQ(event->gpuStartTimestamp, 0u);
    EXPECT_EQ(event->gpuEndTimestamp, 0u);
}

} // namespace ult
} // namespace L0
@@ -8,7 +8,9 @@
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/memory_manager/mock_prefetch_manager.h"
#include "shared/test/common/mocks/mock_ostime.h"
#include "shared/test/common/test_macros/hw_test.h"

#include "level_zero/core/source/event/event.h"
@@ -882,5 +884,373 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore,
    ASSERT_EQ(result, ZE_RESULT_SUCCESS);
}

struct AppendMemoryLockedCopyFixture : public DeviceFixture {
    void setUp() {
        DebugManager.flags.ExperimentalCopyThroughLock.set(1);
        DeviceFixture::setUp();

        nonUsmHostPtr = new char[sz];
        ze_device_mem_alloc_desc_t deviceDesc = {};
        context->allocDeviceMem(device->toHandle(), &deviceDesc, sz, 1u, &devicePtr);
    }
    void tearDown() {
        delete[] nonUsmHostPtr;
        context->freeMem(devicePtr);
        DeviceFixture::tearDown();
    }

    DebugManagerStateRestore restore;
    char *nonUsmHostPtr;
    void *devicePtr;
    size_t sz = 4 * MemoryConstants::megaByte;
};

using AppendMemoryLockedCopyTest = Test<AppendMemoryLockedCopyFixture>;

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledThenReturnTrue, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    NEO::SvmAllocationData *srcAllocData;
    NEO::SvmAllocationData *dstAllocData;
    auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &srcAllocData);
    auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &dstAllocData);
    EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(dstAllocData, dstFound, srcAllocData, srcFound, 1024));
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsAllocDeviceMemoryThenReturnCorrectValue, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    NEO::SvmAllocationData *srcAllocData;
    NEO::SvmAllocationData *dstAllocData;
    auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &srcAllocData);
    auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &dstAllocData);
    EXPECT_FALSE(cmdList.isAllocUSMDeviceMemory(srcAllocData, srcFound));
    EXPECT_TRUE(cmdList.isAllocUSMDeviceMemory(dstAllocData, dstFound));
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrAndFlagDisabledWhenPreferCopyThroughLockedPtrCalledThenReturnFalse, IsXeHpcCore) {
    DebugManager.flags.ExperimentalCopyThroughLock.set(0);
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    NEO::SvmAllocationData *srcAllocData;
    NEO::SvmAllocationData *dstAllocData;
    auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &srcAllocData);
    auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &dstAllocData);
    EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(dstAllocData, dstFound, srcAllocData, srcFound, 1024));
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DThenLockPtr, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    NEO::SvmAllocationData *allocData;
    device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &allocData);
    auto dstAlloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());

    EXPECT_EQ(nullptr, dstAlloc->getLockedPtr());
    cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
    EXPECT_EQ(1u, reinterpret_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager())->lockResourceCalled);
    EXPECT_NE(nullptr, dstAlloc->getLockedPtr());
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyD2HThenLockPtr, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    NEO::SvmAllocationData *allocData;
    device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &allocData);
    auto dstAlloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());

    EXPECT_EQ(nullptr, dstAlloc->getLockedPtr());
    cmdList.appendMemoryCopy(nonUsmHostPtr, devicePtr, 1024, nullptr, 0, nullptr);
    EXPECT_EQ(1u, reinterpret_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager())->lockResourceCalled);
    EXPECT_NE(nullptr, dstAlloc->getLockedPtr());
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DAndDstPtrLockedThenDontLockAgain, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    NEO::SvmAllocationData *allocData;
    device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &allocData);
    auto dstAlloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());

    device->getDriverHandle()->getMemoryManager()->lockResource(dstAlloc);

    EXPECT_EQ(1u, reinterpret_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager())->lockResourceCalled);
    cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
    EXPECT_EQ(1u, reinterpret_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager())->lockResourceCalled);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DThenUseMemcpyAndReturnSuccess, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    memset(nonUsmHostPtr, 1, 1024);

    auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
    EXPECT_EQ(res, ZE_RESULT_SUCCESS);

    NEO::SvmAllocationData *allocData;
    device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &allocData);
    auto dstAlloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
    auto lockedPtr = reinterpret_cast<char *>(dstAlloc->getLockedPtr());
    EXPECT_EQ(0, memcmp(lockedPtr, nonUsmHostPtr, 1024));
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndNonUsmHostPtrWhenCopyH2DThenSignalEvent, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));

    EXPECT_EQ(event->queryStatus(), ZE_RESULT_NOT_READY);
    auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, event->toHandle(), 0, nullptr);
    EXPECT_EQ(res, ZE_RESULT_SUCCESS);

    EXPECT_EQ(event->queryStatus(), ZE_RESULT_SUCCESS);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndCpuMemcpyWhenGpuHangThenDontSynchronizeEvent, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
    reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->callBaseWaitForCompletionWithTimeout = false;
    reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->returnWaitForCompletionWithTimeout = WaitStatus::GpuHang;

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));

    EXPECT_EQ(event->queryStatus(), ZE_RESULT_NOT_READY);
    cmdList.appendBarrier(nullptr, 0, nullptr);
    auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, event->toHandle(), 0, nullptr);
    EXPECT_EQ(res, ZE_RESULT_ERROR_DEVICE_LOST);

    EXPECT_EQ(event->queryStatus(), ZE_RESULT_NOT_READY);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithoutBarrierThenDontWaitForTagUpdate, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
    EXPECT_EQ(res, ZE_RESULT_SUCCESS);

    uint32_t waitForFlushTagUpdateCalled = reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->waitForCompletionWithTimeoutTaskCountCalled;
    EXPECT_EQ(waitForFlushTagUpdateCalled, 0u);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithBarrierThenWaitForTagUpdate, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    cmdList.appendBarrier(nullptr, 0, nullptr);
    auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
    EXPECT_EQ(res, ZE_RESULT_SUCCESS);

    uint32_t waitForFlushTagUpdateCalled = reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->waitForCompletionWithTimeoutTaskCountCalled;
    EXPECT_EQ(waitForFlushTagUpdateCalled, 1u);
}

template <GFXCORE_FAMILY gfxCoreFamily>
class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImmediateHw<gfxCoreFamily> {
  public:
    MockAppendMemoryLockedCopyTestImmediateCmdList() : MockCommandListImmediateHw<gfxCoreFamily>() {}
    ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,
                                             uint64_t dstOffset, void *srcPtr,
                                             NEO::GraphicsAllocation *srcPtrAlloc,
                                             uint64_t srcOffset, uint64_t size,
                                             uint64_t elementSize, Builtin builtin,
                                             Event *signalEvent,
                                             bool isStateless) override {
        appendMemoryCopyKernelWithGACalled++;
        return ZE_RESULT_SUCCESS;
    }
    ze_result_t appendBarrier(ze_event_handle_t hSignalEvent,
                              uint32_t numWaitEvents,
                              ze_event_handle_t *phWaitEvents) override {
        appendBarrierCalled++;
        return MockCommandListImmediateHw<gfxCoreFamily>::appendBarrier(hSignalEvent, numWaitEvents, phWaitEvents);
    }

    uint32_t appendBarrierCalled = 0;
    uint32_t appendMemoryCopyKernelWithGACalled = 0;
};

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyH2DThenUseGpuMemcpy, IsXeHpcCore) {
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    void *usmSrcPtr;
    ze_host_mem_alloc_desc_t hostDesc = {};
    context->allocHostMem(&hostDesc, 1024, 1u, &usmSrcPtr);

    cmdList.appendMemoryCopy(devicePtr, usmSrcPtr, 1024, nullptr, 0, nullptr);
    EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
    context->freeMem(usmSrcPtr);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmDstHostPtrWhenCopyThenUseGpuMemcpy, IsXeHpcCore) {
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    void *usmHostDstPtr;
    ze_host_mem_alloc_desc_t hostDesc = {};
    context->allocHostMem(&hostDesc, 1024, 1u, &usmHostDstPtr);

    cmdList.appendMemoryCopy(usmHostDstPtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
    EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
    context->freeMem(usmHostDstPtr);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyThenUseGpuMemcpy, IsXeHpcCore) {
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    void *usmHostSrcPtr;
    ze_host_mem_alloc_desc_t hostDesc = {};
    context->allocHostMem(&hostDesc, 1024, 1u, &usmHostSrcPtr);

    cmdList.appendMemoryCopy(nonUsmHostPtr, usmHostSrcPtr, 1024, nullptr, 0, nullptr);
    EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
    context->freeMem(usmHostSrcPtr);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmSrcHostPtrWhenSizeTooLargeThenUseGpuMemcpy, IsXeHpcCore) {
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 3 * MemoryConstants::megaByte, nullptr, 0, nullptr);
    EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmDstHostPtrWhenSizeTooLargeThenUseGpuMemcpy, IsXeHpcCore) {
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.appendMemoryCopy(nonUsmHostPtr, devicePtr, 2 * MemoryConstants::kiloByte, nullptr, 0, nullptr);
    EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndD2HCopyWhenSizeTooLargeButFlagSetThenUseCpuMemcpy, IsXeHpcCore) {
    DebugManager.flags.ExperimentalD2HCpuCopyThreshold.set(2048);
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    cmdList.appendMemoryCopy(nonUsmHostPtr, devicePtr, 2 * MemoryConstants::kiloByte, nullptr, 0, nullptr);
    EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndH2DCopyWhenSizeTooLargeButFlagSetThenUseCpuMemcpy, IsXeHpcCore) {
    DebugManager.flags.ExperimentalH2DCpuCopyThreshold.set(3 * MemoryConstants::megaByte);
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
    cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 3 * MemoryConstants::megaByte, nullptr, 0, nullptr);
    EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithDependencyThenAppendBarrierCalled, IsXeHpcCore) {
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    auto phEvent = event->toHandle();
    cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 2 * MemoryConstants::kiloByte, nullptr, 1, &phEvent);
    EXPECT_EQ(cmdList.appendBarrierCalled, 1u);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithoutDependencyThenAppendBarrierNotCalled, IsXeHpcCore) {
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
    cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 2 * MemoryConstants::kiloByte, nullptr, 0, nullptr);
    EXPECT_EQ(cmdList.appendBarrierCalled, 0u);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndTimestampFlagSetWhenCpuMemcpyThenSetCorrectGpuTimestamps, IsXeHpcCore) {
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
    neoDevice->setOSTime(new NEO::MockOSTimeWithConstTimestamp());

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;

    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    auto phEvent = event->toHandle();
    cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 2 * MemoryConstants::kiloByte, phEvent, 0, nullptr);
    ze_kernel_timestamp_result_t resultTimestamp = {};
    auto result = event->queryKernelTimestamp(&resultTimestamp);
    EXPECT_EQ(result, ZE_RESULT_SUCCESS);

    EXPECT_EQ(resultTimestamp.context.kernelStart, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
    EXPECT_EQ(resultTimestamp.global.kernelStart, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
    EXPECT_EQ(resultTimestamp.context.kernelEnd, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
    EXPECT_EQ(resultTimestamp.global.kernelEnd, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
}

HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndTimestampFlagNotSetWhenCpuMemcpyThenDontSetGpuTimestamps, IsXeHpcCore) {
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
    neoDevice->setOSTime(new NEO::MockOSTimeWithConstTimestamp());

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    auto phEvent = event->toHandle();
    cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 2 * MemoryConstants::kiloByte, phEvent, 0, nullptr);
    ze_kernel_timestamp_result_t resultTimestamp = {};
    auto result = event->queryKernelTimestamp(&resultTimestamp);
    EXPECT_EQ(result, ZE_RESULT_SUCCESS);

    EXPECT_NE(resultTimestamp.context.kernelEnd, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
}

} // namespace ult
} // namespace L0
@@ -424,9 +424,12 @@ DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experim
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, "Align local memory allocations to a given value. Works only with allocations at least as big as the value. 0: no effect, 2097152: 2 megabytes, 1073741824: 1 gigabyte")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableDeviceAllocationCache, -1, "Experimentally enable allocation cache.")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalH2DCpuCopyThreshold, -1, "Override default threshold (in bytes) for H2D CPU copy.")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalD2HCpuCopyThreshold, -1, "Override default threshold (in bytes) for D2H CPU copy.")
DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableSourceLevelDebugger, false, "Experimentally enable source level debugger.")
DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableL0DebuggerForOpenCL, false, "Experimentally enable debugging OCL with L0 Debug API.")
DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableTileAttach, false, "Experimentally enable attaching to tiles (subdevices).")
DECLARE_DEBUG_VARIABLE(bool, ExperimentalCopyThroughLock, false, "Experimentally copy memory through locked ptr.")

/*DRIVER TOGGLES*/
DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger")
@@ -158,6 +158,7 @@ class HwHelper {
    virtual bool isPlatformFlushTaskEnabled(const NEO::HardwareInfo &hwInfo) const = 0;
    virtual bool isPatIndexFallbackWaRequired() const = 0;
    virtual uint32_t getMinimalScratchSpaceSize() const = 0;
    virtual bool copyThroughLockedPtrEnabled() const = 0;

  protected:
    HwHelper() = default;
@@ -399,6 +400,7 @@ class HwHelperHw : public HwHelper {
    bool isPlatformFlushTaskEnabled(const NEO::HardwareInfo &hwInfo) const override;
    bool isPatIndexFallbackWaRequired() const override;
    uint32_t getMinimalScratchSpaceSize() const override;
    bool copyThroughLockedPtrEnabled() const override;

  protected:
    static const AuxTranslationMode defaultAuxTranslationMode;
@@ -718,4 +718,9 @@ bool HwHelperHw<GfxFamily>::isPatIndexFallbackWaRequired() const {
    return false;
}

template <typename gfxProduct>
bool HwHelperHw<gfxProduct>::copyThroughLockedPtrEnabled() const {
    return false;
}

} // namespace NEO
@@ -442,6 +442,11 @@ bool HwHelperHw<Family>::isPatIndexFallbackWaRequired() const {
    return true;
}

template <>
bool HwHelperHw<Family>::copyThroughLockedPtrEnabled() const {
    return DebugManager.flags.ExperimentalCopyThroughLock.get();
}

} // namespace NEO

#include "shared/source/helpers/hw_helper_pvc_and_later.inl"
@@ -458,4 +458,7 @@ ExperimentalEnableTileAttach = 0
DirectSubmissionDisablePrefetcher = -1
ForceDefaultGrfCompilationMode = 0
ForceLargeGrfCompilationMode = 0
ForceStatelessMocsEncryptionBit = -1
ExperimentalCopyThroughLock = 0
ExperimentalH2DCpuCopyThreshold = -1
ExperimentalD2HCpuCopyThreshold = -1
@@ -6,6 +6,7 @@
 */

#include "shared/source/helpers/compiler_hw_info_config.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/test_macros/test.h"
@@ -196,3 +197,8 @@ HWTEST2_F(HwInfoConfigTest, givenHwInfoConfigWhenIsPlatformQueryNotSupportedThen
    const auto &hwInfoConfig = *HwInfoConfig::get(productFamily);
    EXPECT_FALSE(hwInfoConfig.isPlatformQuerySupported());
}

HWTEST_F(HwInfoConfigTest, givenHwHelperWhenCallCopyThroughLockedPtrEnabledThenReturnFalse) {
    HwHelper &hwHelper = HwHelper::get(defaultHwInfo->platform.eRenderCoreFamily);
    EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled());
}
@@ -6,6 +6,7 @@
 */

#include "shared/source/helpers/hw_helper.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/helpers/hw_helper_tests.h"
#include "shared/test/common/test_macros/header/per_product_test_definitions.h"
@@ -73,3 +74,14 @@ XE_HPC_CORETEST_F(HwHelperXeHpcCoreTest, givenXeHPCPlatformWhenCheckAssignEngine
    auto &hwHelper = HwHelperHw<FamilyType>::get();
    EXPECT_EQ(hwHelper.isAssignEngineRoundRobinSupported(hwInfo), HwInfoConfig::get(hwInfo.platform.eProductFamily)->isAssignEngineRoundRobinSupported());
}

XE_HPC_CORETEST_F(HwHelperTest, givenFlagSetWhenCallCopyThroughLockedPtrEnabledThenReturnCorrectValue) {
    DebugManagerStateRestore restore;
    auto &hwHelper = HwHelperHw<FamilyType>::get();

    DebugManager.flags.ExperimentalCopyThroughLock.set(false);
    EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled());

    DebugManager.flags.ExperimentalCopyThroughLock.set(true);
    EXPECT_TRUE(hwHelper.copyThroughLockedPtrEnabled());
}