[L0][XE_HPC] Perform memcpy on CPU for non-USM ptrs

Related-To: NEO-7237

If the size is small enough, it is more efficient to
perform the copy on the CPU through a locked pointer.
This change also introduces an experimental flag to
enable this path.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
Authored by Szymon Morek on 2022-09-20 09:32:33 +00:00; committed by Compute-Runtime-Automation
parent 6c1504a0f4
commit ec04de61a7
16 changed files with 575 additions and 11 deletions
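For context, a minimal C++ sketch of the heuristic this change implements (illustrative only, not part of the diff; the function and parameter names are hypothetical, while the thresholds mirror the defaults added below: 2 MB for host-to-device and 1 KB for device-to-host copies):

#include <cstddef>

constexpr size_t kiloByte = 1024u;
constexpr size_t megaByte = 1024u * kiloByte;

// Copy on the CPU through a locked (CPU-mapped) pointer only when one side is
// a USM device allocation, the other side is not USM at all, and the size is
// below the direction-specific threshold. In the driver this decision is made
// by preferCopyThroughLockedPtr and is additionally gated by
// HwHelper::copyThroughLockedPtrEnabled(), i.e. by the experimental flag and,
// in this commit, by XE_HPC only.
bool preferCpuCopyThroughLockedPtr(bool srcIsUsm, bool srcIsUsmDevice,
                                   bool dstIsUsm, bool dstIsUsmDevice,
                                   size_t size) {
    const size_t h2dThreshold = 2 * megaByte; // host -> device default
    const size_t d2hThreshold = 1 * kiloByte; // device -> host default
    const bool hostToDevice = !srcIsUsm && dstIsUsmDevice && size <= h2dThreshold;
    const bool deviceToHost = !dstIsUsm && srcIsUsmDevice && size <= d2hThreshold;
    return hostToDevice || deviceToHost;
}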

View File

@@ -323,6 +323,9 @@ struct CommandList : _ze_command_list_handle_t {
bool multiReturnPointCommandList = false;
bool systolicModeSupport = false;
bool pipelineSelectStateTracking = false;
std::atomic<uint32_t> barrierCounter{0u};
uint32_t latestFlushedBarrierCounter = 0u;
};
using CommandListAllocatorFn = CommandList *(*)(uint32_t);

View File

@@ -2492,7 +2492,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
}
appendSignalEventPostWalker(signalEvent, workloadPartition);
this->barrierCounter++;
return ZE_RESULT_SUCCESS;
}

View File

@@ -9,6 +9,10 @@
#include "level_zero/core/source/cmdlist/cmdlist_hw.h"
namespace NEO {
struct SvmAllocationData;
}
namespace L0 {
struct EventPool;
@@ -123,6 +127,11 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
void createLogicalStateHelper() override {}
NEO::LogicalStateHelper *getLogicalStateHelper() const override;
bool preferCopyThroughLockedPtr(NEO::SvmAllocationData *dstAlloc, bool dstFound, NEO::SvmAllocationData *srcAlloc, bool srcFound, size_t size);
bool isAllocUSMDeviceMemory(NEO::SvmAllocationData *alloc, bool allocFound);
ze_result_t performCpuMemcpy(void *dstptr, const void *srcptr, size_t size, bool isDstDeviceMemory, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
void *obtainLockedPtrFromDevice(void *ptr, size_t size);
};
template <PRODUCT_FAMILY gfxProductFamily>

View File

@@ -13,6 +13,7 @@
#include "shared/source/helpers/logical_state_helper.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/prefetch_manager.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h"
#include "level_zero/core/source/device/bcs_split.h"
@@ -227,6 +228,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
ze_result_t ret;
NEO::SvmAllocationData *srcAllocData = nullptr;
NEO::SvmAllocationData *dstAllocData = nullptr;
bool srcAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &srcAllocData);
bool dstAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(dstptr, size, &dstAllocData);
if (preferCopyThroughLockedPtr(dstAllocData, dstAllocFound, srcAllocData, srcAllocFound, size)) {
return performCpuMemcpy(dstptr, srcptr, size, dstAllocFound, hSignalEvent, numWaitEvents, phWaitEvents);
}
if (this->isAppendSplitNeeded(dstptr, srcptr, size)) {
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents);
@@ -461,4 +470,91 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
return inputRet;
}
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(NEO::SvmAllocationData *dstAlloc, bool dstFound, NEO::SvmAllocationData *srcAlloc, bool srcFound, size_t size) {
size_t h2DThreshold = 2 * MemoryConstants::megaByte;
size_t d2HThreshold = 1 * MemoryConstants::kiloByte;
if (NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get() != -1) {
h2DThreshold = NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get();
}
if (NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get() != -1) {
d2HThreshold = NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get();
}
if (NEO::HwHelper::get(this->device->getHwInfo().platform.eRenderCoreFamily).copyThroughLockedPtrEnabled()) {
return (!srcFound && isAllocUSMDeviceMemory(dstAlloc, dstFound) && size <= h2DThreshold) ||
(!dstFound && isAllocUSMDeviceMemory(srcAlloc, srcFound) && size <= d2HThreshold);
}
return false;
}
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isAllocUSMDeviceMemory(NEO::SvmAllocationData *alloc, bool allocFound) {
return allocFound && (alloc->memoryType == InternalMemoryType::DEVICE_UNIFIED_MEMORY);
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(void *dstptr, const void *srcptr, size_t size, bool isDstDeviceMemory, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
bool needsBarrier = (numWaitEvents > 0);
if (needsBarrier) {
this->appendBarrier(nullptr, numWaitEvents, phWaitEvents);
}
bool needsFlushTagUpdate = this->latestFlushedBarrierCounter < this->barrierCounter;
if (needsFlushTagUpdate) {
this->csr->flushTagUpdate();
}
Event *signalEvent = nullptr;
if (hSignalEvent) {
signalEvent = Event::fromHandle(hSignalEvent);
}
const void *cpuMemcpySrcPtr = nullptr;
void *cpuMemcpyDstPtr = nullptr;
if (isDstDeviceMemory) {
cpuMemcpySrcPtr = srcptr;
cpuMemcpyDstPtr = obtainLockedPtrFromDevice(dstptr, size);
} else {
cpuMemcpySrcPtr = obtainLockedPtrFromDevice(const_cast<void *>(srcptr), size);
cpuMemcpyDstPtr = dstptr;
}
if (needsFlushTagUpdate) {
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
const auto waitStatus = this->csr->waitForCompletionWithTimeout(NEO::WaitParams{false, false, timeoutMicroseconds}, this->csr->peekTaskCount());
if (waitStatus == NEO::WaitStatus::GpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
this->latestFlushedBarrierCounter = this->barrierCounter;
}
if (signalEvent) {
signalEvent->setGpuStartTimestamp();
}
memcpy_s(cpuMemcpyDstPtr, size, cpuMemcpySrcPtr, size);
if (signalEvent) {
signalEvent->setGpuEndTimestamp();
signalEvent->hostSignal();
}
return ZE_RESULT_SUCCESS;
}
template <GFXCORE_FAMILY gfxCoreFamily>
void *CommandListCoreFamilyImmediate<gfxCoreFamily>::obtainLockedPtrFromDevice(void *ptr, size_t size) {
NEO::SvmAllocationData *allocData = nullptr;
auto allocFound = this->device->getDriverHandle()->findAllocationDataForRange(ptr, size, &allocData);
UNRECOVERABLE_IF(!allocFound);
auto alloc = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex());
if (!alloc->isLocked()) {
this->device->getDriverHandle()->getMemoryManager()->lockResource(alloc);
}
auto gpuAddress = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex())->getGpuAddress();
auto offset = ptrDiff(ptr, gpuAddress);
return ptrOffset(alloc->getLockedPtr(), offset);
}
} // namespace L0

View File

@@ -65,6 +65,8 @@ struct Event : _ze_event_handle_t {
void *getHostAddress() { return hostAddress; }
virtual void setPacketsInUse(uint32_t value) = 0;
uint32_t getCurrKernelDataIndex() const { return kernelCount - 1; }
virtual void setGpuStartTimestamp() = 0;
virtual void setGpuEndTimestamp() = 0;
size_t getContextStartOffset() const {
return contextStartOffset;
@@ -143,6 +145,10 @@ struct Event : _ze_event_handle_t {
size_t singlePacketSize = 0u;
size_t eventPoolOffset = 0u;
size_t cpuStartTimestamp = 0u;
size_t gpuStartTimestamp = 0u;
size_t gpuEndTimestamp = 0u;
uint32_t kernelCount = 1u;
bool isTimestampEvent = false;
@@ -195,6 +201,8 @@ struct EventImp : public Event {
uint32_t getPacketsInUse() override;
uint32_t getPacketsUsedInLastKernel() override;
void setPacketsInUse(uint32_t value) override;
void setGpuStartTimestamp() override;
void setGpuEndTimestamp() override;
std::unique_ptr<KernelEventCompletionData<TagSizeT>[]> kernelEventCompletionData;

View File

@@ -7,6 +7,7 @@
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/os_interface/os_time.h"
#include "level_zero/core/source/event/event.h"
#include "level_zero/core/source/hw_helpers/l0_hw_helper.h"
@@ -167,18 +168,24 @@ template <typename TagSizeT>
ze_result_t EventImp<TagSizeT>::hostEventSetValueTimestamps(TagSizeT eventVal) {
auto baseAddr = castToUint64(hostAddress);
auto eventTsSetFunc = [&eventVal](auto tsAddr) {
auto eventTsSetFunc = [](auto tsAddr, TagSizeT value) {
auto tsptr = reinterpret_cast<void *>(tsAddr);
memcpy_s(tsptr, sizeof(TagSizeT), static_cast<void *>(&eventVal), sizeof(TagSizeT));
memcpy_s(tsptr, sizeof(TagSizeT), static_cast<void *>(&value), sizeof(TagSizeT));
};
TagSizeT timestampStart = eventVal;
TagSizeT timestampEnd = eventVal;
if (eventVal == Event::STATE_SIGNALED) {
timestampStart = static_cast<TagSizeT>(this->gpuStartTimestamp);
timestampEnd = static_cast<TagSizeT>(this->gpuEndTimestamp);
}
for (uint32_t i = 0; i < kernelCount; i++) {
uint32_t packetsToSet = kernelEventCompletionData[i].getPacketsUsed();
for (uint32_t j = 0; j < packetsToSet; j++) {
eventTsSetFunc(baseAddr + contextStartOffset);
eventTsSetFunc(baseAddr + globalStartOffset);
eventTsSetFunc(baseAddr + contextEndOffset);
eventTsSetFunc(baseAddr + globalEndOffset);
eventTsSetFunc(baseAddr + contextStartOffset, timestampStart);
eventTsSetFunc(baseAddr + globalStartOffset, timestampStart);
eventTsSetFunc(baseAddr + contextEndOffset, timestampEnd);
eventTsSetFunc(baseAddr + globalEndOffset, timestampEnd);
baseAddr += singlePacketSize;
}
}
@@ -316,7 +323,6 @@ ze_result_t EventImp<TagSizeT>::queryKernelTimestamp(ze_kernel_timestamp_result_
eventTsSetFunc(globalEndTS, result.context.kernelEnd);
eventTsSetFunc(globalEndTS, result.global.kernelEnd);
}
return ZE_RESULT_SUCCESS;
}
@@ -379,6 +385,9 @@ void EventImp<TagSizeT>::resetPackets() {
kernelEventCompletionData[i].setPacketsUsed(1);
}
kernelCount = 1;
cpuStartTimestamp = 0;
gpuStartTimestamp = 0;
gpuEndTimestamp = 0;
}
template <typename TagSizeT>
@@ -410,4 +419,21 @@ uint64_t EventImp<TagSizeT>::getPacketAddress(Device *device) {
return address;
}
template <typename TagSizeT>
void EventImp<TagSizeT>::setGpuStartTimestamp() {
if (isEventTimestampFlagSet()) {
this->device->getGlobalTimestamps(&cpuStartTimestamp, &gpuStartTimestamp);
cpuStartTimestamp = cpuStartTimestamp / this->device->getNEODevice()->getDeviceInfo().outProfilingTimerResolution;
}
}
template <typename TagSizeT>
void EventImp<TagSizeT>::setGpuEndTimestamp() {
if (isEventTimestampFlagSet()) {
auto resolution = this->device->getNEODevice()->getDeviceInfo().outProfilingTimerResolution;
auto cpuEndTimestamp = this->device->getNEODevice()->getOSTime()->getCpuRawTimestamp() / resolution;
this->gpuEndTimestamp = gpuStartTimestamp + (cpuEndTimestamp - cpuStartTimestamp);
}
}
} // namespace L0

View File

@@ -67,8 +67,11 @@ struct Mock<EventPool> : public EventPool {
class MockEvent : public ::L0::Event {
public:
using ::L0::Event::gpuEndTimestamp;
using ::L0::Event::gpuStartTimestamp;
using ::L0::Event::isCompleted;
using ::L0::Event::l3FlushAppliedOnKernel;
MockEvent() {
mockAllocation.reset(new NEO::MockGraphicsAllocation(0,
NEO::AllocationType::INTERNAL_HOST_MEMORY,
@@ -119,7 +122,8 @@ class MockEvent : public ::L0::Event {
void resetPackets() override {}
void setPacketsInUse(uint32_t value) override {}
uint64_t getPacketAddress(L0::Device *) override { return 0; }
void setGpuStartTimestamp() override {}
void setGpuEndTimestamp() override {}
std::unique_ptr<NEO::GraphicsAllocation> mockAllocation;
};

View File

@@ -2202,6 +2202,9 @@ HWTEST_F(EventTests,
}
struct MockEventCompletion : public EventImp<uint32_t> {
using EventImp<uint32_t>::gpuStartTimestamp;
using EventImp<uint32_t>::gpuEndTimestamp;
MockEventCompletion(L0::EventPool *eventPool, int index, L0::Device *device) : EventImp(eventPool, index, device) {
auto neoDevice = device->getNEODevice();
kernelEventCompletionData = std::make_unique<KernelEventCompletionData<uint32_t>[]>(EventPacketsCount::maxKernelSplit);
@@ -2260,5 +2263,14 @@ TEST_F(EventTests, WhenQueryingStatusAfterResetThenAccessMemory) {
EXPECT_EQ(event->assignKernelEventCompletionDataCounter, 2u);
}
TEST_F(EventTests, WhenResetEventThenZeroGpuTimestamps) {
auto event = std::make_unique<MockEventCompletion>(eventPool, 1u, device);
event->gpuStartTimestamp = 10u;
event->gpuEndTimestamp = 20u;
EXPECT_EQ(event->reset(), ZE_RESULT_SUCCESS);
EXPECT_EQ(event->gpuStartTimestamp, 0u);
EXPECT_EQ(event->gpuEndTimestamp, 0u);
}
} // namespace ult
} // namespace L0

View File

@@ -8,7 +8,9 @@
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/memory_manager/mock_prefetch_manager.h"
#include "shared/test/common/mocks/mock_ostime.h"
#include "shared/test/common/test_macros/hw_test.h"
#include "level_zero/core/source/event/event.h"
@@ -882,5 +884,373 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore,
ASSERT_EQ(result, ZE_RESULT_SUCCESS);
}
struct AppendMemoryLockedCopyFixture : public DeviceFixture {
void setUp() {
DebugManager.flags.ExperimentalCopyThroughLock.set(1);
DeviceFixture::setUp();
nonUsmHostPtr = new char[sz];
ze_device_mem_alloc_desc_t deviceDesc = {};
context->allocDeviceMem(device->toHandle(), &deviceDesc, sz, 1u, &devicePtr);
}
void tearDown() {
delete[] nonUsmHostPtr;
context->freeMem(devicePtr);
DeviceFixture::tearDown();
}
DebugManagerStateRestore restore;
char *nonUsmHostPtr;
void *devicePtr;
size_t sz = 4 * MemoryConstants::megaByte;
};
using AppendMemoryLockedCopyTest = Test<AppendMemoryLockedCopyFixture>;
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledThenReturnTrue, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
NEO::SvmAllocationData *srcAllocData;
NEO::SvmAllocationData *dstAllocData;
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &srcAllocData);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &dstAllocData);
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(dstAllocData, dstFound, srcAllocData, srcFound, 1024));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsAllocDeviceMemoryThenReturnCorrectValue, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
NEO::SvmAllocationData *srcAllocData;
NEO::SvmAllocationData *dstAllocData;
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &srcAllocData);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &dstAllocData);
EXPECT_FALSE(cmdList.isAllocUSMDeviceMemory(srcAllocData, srcFound));
EXPECT_TRUE(cmdList.isAllocUSMDeviceMemory(dstAllocData, dstFound));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrAndFlagDisabledWhenPreferCopyThroughLockedPtrCalledThenReturnFalse, IsXeHpcCore) {
DebugManager.flags.ExperimentalCopyThroughLock.set(0);
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
NEO::SvmAllocationData *srcAllocData;
NEO::SvmAllocationData *dstAllocData;
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &srcAllocData);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &dstAllocData);
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(dstAllocData, dstFound, srcAllocData, srcFound, 1024));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DThenLockPtr, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
NEO::SvmAllocationData *allocData;
device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &allocData);
auto dstAlloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
EXPECT_EQ(nullptr, dstAlloc->getLockedPtr());
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
EXPECT_EQ(1u, reinterpret_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager())->lockResourceCalled);
EXPECT_NE(nullptr, dstAlloc->getLockedPtr());
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyD2HThenLockPtr, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
NEO::SvmAllocationData *allocData;
device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &allocData);
auto dstAlloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
EXPECT_EQ(nullptr, dstAlloc->getLockedPtr());
cmdList.appendMemoryCopy(nonUsmHostPtr, devicePtr, 1024, nullptr, 0, nullptr);
EXPECT_EQ(1u, reinterpret_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager())->lockResourceCalled);
EXPECT_NE(nullptr, dstAlloc->getLockedPtr());
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DAndDstPtrLockedThenDontLockAgain, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
NEO::SvmAllocationData *allocData;
device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &allocData);
auto dstAlloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
device->getDriverHandle()->getMemoryManager()->lockResource(dstAlloc);
EXPECT_EQ(1u, reinterpret_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager())->lockResourceCalled);
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
EXPECT_EQ(1u, reinterpret_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager())->lockResourceCalled);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DThenUseMemcpyAndReturnSuccess, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
memset(nonUsmHostPtr, 1, 1024);
auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
EXPECT_EQ(res, ZE_RESULT_SUCCESS);
NEO::SvmAllocationData *allocData;
device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &allocData);
auto dstAlloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
auto lockedPtr = reinterpret_cast<char *>(dstAlloc->getLockedPtr());
EXPECT_EQ(0, memcmp(lockedPtr, nonUsmHostPtr, 1024));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndNonUsmHostPtrWhenCopyH2DThenSignalEvent, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
ze_result_t returnValue = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
EXPECT_EQ(event->queryStatus(), ZE_RESULT_NOT_READY);
auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, event->toHandle(), 0, nullptr);
EXPECT_EQ(res, ZE_RESULT_SUCCESS);
EXPECT_EQ(event->queryStatus(), ZE_RESULT_SUCCESS);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndCpuMemcpyWhenGpuHangThenDontSynchronizeEvent, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->callBaseWaitForCompletionWithTimeout = false;
reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->returnWaitForCompletionWithTimeout = WaitStatus::GpuHang;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
ze_result_t returnValue = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
EXPECT_EQ(event->queryStatus(), ZE_RESULT_NOT_READY);
cmdList.appendBarrier(nullptr, 0, nullptr);
auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, event->toHandle(), 0, nullptr);
EXPECT_EQ(res, ZE_RESULT_ERROR_DEVICE_LOST);
EXPECT_EQ(event->queryStatus(), ZE_RESULT_NOT_READY);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithoutBarrierThenDontWaitForTagUpdate, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
EXPECT_EQ(res, ZE_RESULT_SUCCESS);
uint32_t waitForFlushTagUpdateCalled = reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->waitForCompletionWithTimeoutTaskCountCalled;
EXPECT_EQ(waitForFlushTagUpdateCalled, 0u);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithBarrierThenWaitForTagUpdate, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.appendBarrier(nullptr, 0, nullptr);
auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
EXPECT_EQ(res, ZE_RESULT_SUCCESS);
uint32_t waitForFlushTagUpdateCalled = reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->waitForCompletionWithTimeoutTaskCountCalled;
EXPECT_EQ(waitForFlushTagUpdateCalled, 1u);
}
template <GFXCORE_FAMILY gfxCoreFamily>
class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImmediateHw<gfxCoreFamily> {
public:
MockAppendMemoryLockedCopyTestImmediateCmdList() : MockCommandListImmediateHw<gfxCoreFamily>() {}
ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,
uint64_t dstOffset, void *srcPtr,
NEO::GraphicsAllocation *srcPtrAlloc,
uint64_t srcOffset, uint64_t size,
uint64_t elementSize, Builtin builtin,
Event *signalEvent,
bool isStateless) override {
appendMemoryCopyKernelWithGACalled++;
return ZE_RESULT_SUCCESS;
}
ze_result_t appendBarrier(ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) override {
appendBarrierCalled++;
return MockCommandListImmediateHw<gfxCoreFamily>::appendBarrier(hSignalEvent, numWaitEvents, phWaitEvents);
}
uint32_t appendBarrierCalled = 0;
uint32_t appendMemoryCopyKernelWithGACalled = 0;
};
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyH2DThenUseGpuMemcpy, IsXeHpcCore) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *usmSrcPtr;
ze_host_mem_alloc_desc_t hostDesc = {};
context->allocHostMem(&hostDesc, 1024, 1u, &usmSrcPtr);
cmdList.appendMemoryCopy(devicePtr, usmSrcPtr, 1024, nullptr, 0, nullptr);
EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
context->freeMem(usmSrcPtr);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmDstHostPtrWhenCopyThenUseGpuMemcpy, IsXeHpcCore) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *usmHostDstPtr;
ze_host_mem_alloc_desc_t hostDesc = {};
context->allocHostMem(&hostDesc, 1024, 1u, &usmHostDstPtr);
cmdList.appendMemoryCopy(usmHostDstPtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr);
EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
context->freeMem(usmHostDstPtr);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyThenUseGpuMemcpy, IsXeHpcCore) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *usmHostSrcPtr;
ze_host_mem_alloc_desc_t hostDesc = {};
context->allocHostMem(&hostDesc, 1024, 1u, &usmHostSrcPtr);
cmdList.appendMemoryCopy(nonUsmHostPtr, usmHostSrcPtr, 1024, nullptr, 0, nullptr);
EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
context->freeMem(usmHostSrcPtr);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmSrcHostPtrWhenSizeTooLargeThenUseGpuMemcpy, IsXeHpcCore) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 3 * MemoryConstants::megaByte, nullptr, 0, nullptr);
EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmDstHostPtrWhenSizeTooLargeThenUseGpuMemcpy, IsXeHpcCore) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.appendMemoryCopy(nonUsmHostPtr, devicePtr, 2 * MemoryConstants::kiloByte, nullptr, 0, nullptr);
EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndD2HCopyWhenSizeTooLargeButFlagSetThenUseCpuMemcpy, IsXeHpcCore) {
DebugManager.flags.ExperimentalD2HCpuCopyThreshold.set(2048);
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.appendMemoryCopy(nonUsmHostPtr, devicePtr, 2 * MemoryConstants::kiloByte, nullptr, 0, nullptr);
EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndH2DCopyWhenSizeTooLargeButFlagSetThenUseCpuMemcpy, IsXeHpcCore) {
DebugManager.flags.ExperimentalH2DCpuCopyThreshold.set(3 * MemoryConstants::megaByte);
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 3 * MemoryConstants::megaByte, nullptr, 0, nullptr);
EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithDependencyThenAppendBarrierCalled, IsXeHpcCore) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
ze_result_t returnValue = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
auto phEvent = event->toHandle();
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 2 * MemoryConstants::kiloByte, nullptr, 1, &phEvent);
EXPECT_EQ(cmdList.appendBarrierCalled, 1u);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithoutDependencyThenAppendBarrierNotCalled, IsXeHpcCore) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 2 * MemoryConstants::kiloByte, nullptr, 0, nullptr);
EXPECT_EQ(cmdList.appendBarrierCalled, 0u);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndTimestampFlagSetWhenCpuMemcpyThenSetCorrectGpuTimestamps, IsXeHpcCore) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
neoDevice->setOSTime(new NEO::MockOSTimeWithConstTimestamp());
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
ze_result_t returnValue = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
auto phEvent = event->toHandle();
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 2 * MemoryConstants::kiloByte, phEvent, 0, nullptr);
ze_kernel_timestamp_result_t resultTimestamp = {};
auto result = event->queryKernelTimestamp(&resultTimestamp);
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
EXPECT_EQ(resultTimestamp.context.kernelStart, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
EXPECT_EQ(resultTimestamp.global.kernelStart, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
EXPECT_EQ(resultTimestamp.context.kernelEnd, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
EXPECT_EQ(resultTimestamp.global.kernelEnd, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndTimestampFlagNotSetWhenCpuMemcpyThenDontSetGpuTimestamps, IsXeHpcCore) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
neoDevice->setOSTime(new NEO::MockOSTimeWithConstTimestamp());
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
ze_result_t returnValue = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
auto phEvent = event->toHandle();
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 2 * MemoryConstants::kiloByte, phEvent, 0, nullptr);
ze_kernel_timestamp_result_t resultTimestamp = {};
auto result = event->queryKernelTimestamp(&resultTimestamp);
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
EXPECT_NE(resultTimestamp.context.kernelEnd, NEO::MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
}
} // namespace ult
} // namespace L0

View File

@@ -424,9 +424,12 @@ DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experim
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, "Align local memory allocations to a given value. Works only with allocations at least as big as the value. 0: no effect, 2097152: 2 megabytes, 1073741824: 1 gigabyte")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableDeviceAllocationCache, -1, "Experimentally enable allocation cache.")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalH2DCpuCopyThreshold, -1, "Override default threshold (in bytes) for H2D CPU copy.")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalD2HCpuCopyThreshold, -1, "Override default threshold (in bytes) for D2H CPU copy.")
DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableSourceLevelDebugger, false, "Experimentally enable source level debugger.")
DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableL0DebuggerForOpenCL, false, "Experimentally enable debugging OCL with L0 Debug API.")
DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableTileAttach, false, "Experimentally enable attaching to tiles (subdevices).")
DECLARE_DEBUG_VARIABLE(bool, ExperimentalCopyThroughLock, false, "Experimentally copy memory through locked ptr.")
/*DRIVER TOGGLES*/
DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger")

View File

@@ -158,6 +158,7 @@ class HwHelper {
virtual bool isPlatformFlushTaskEnabled(const NEO::HardwareInfo &hwInfo) const = 0;
virtual bool isPatIndexFallbackWaRequired() const = 0;
virtual uint32_t getMinimalScratchSpaceSize() const = 0;
virtual bool copyThroughLockedPtrEnabled() const = 0;
protected:
HwHelper() = default;
@@ -399,6 +400,7 @@ class HwHelperHw : public HwHelper {
bool isPlatformFlushTaskEnabled(const NEO::HardwareInfo &hwInfo) const override;
bool isPatIndexFallbackWaRequired() const override;
uint32_t getMinimalScratchSpaceSize() const override;
bool copyThroughLockedPtrEnabled() const override;
protected:
static const AuxTranslationMode defaultAuxTranslationMode;

View File

@@ -718,4 +718,9 @@ bool HwHelperHw<GfxFamily>::isPatIndexFallbackWaRequired() const {
return false;
}
template <typename gfxProduct>
bool HwHelperHw<gfxProduct>::copyThroughLockedPtrEnabled() const {
return false;
}
} // namespace NEO

View File

@@ -442,6 +442,11 @@ bool HwHelperHw<Family>::isPatIndexFallbackWaRequired() const {
return true;
}
template <>
bool HwHelperHw<Family>::copyThroughLockedPtrEnabled() const {
return DebugManager.flags.ExperimentalCopyThroughLock.get();
}
} // namespace NEO
#include "shared/source/helpers/hw_helper_pvc_and_later.inl"

View File

@@ -458,4 +458,7 @@ ExperimentalEnableTileAttach = 0
DirectSubmissionDisablePrefetcher = -1
ForceDefaultGrfCompilationMode = 0
ForceLargeGrfCompilationMode = 0
ForceStatelessMocsEncryptionBit = -1
ForceStatelessMocsEncryptionBit = -1
ExperimentalCopyThroughLock = 0
ExperimentalH2DCpuCopyThreshold = -1
ExperimentalD2HCpuCopyThreshold = -1
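
The defaults above keep the feature disabled. Assuming the usual NEO mechanism for overriding debug variables (an igdrcl.config file next to the application, or environment variables of the same name, with NEOReadDebugKeys=1 required in release builds), the path could be enabled for experimentation roughly as follows; the threshold values here are arbitrary examples, and copyThroughLockedPtrEnabled() honors the flag only on XE_HPC in this commit:

ExperimentalCopyThroughLock = 1
ExperimentalH2DCpuCopyThreshold = 4194304
ExperimentalD2HCpuCopyThreshold = 4096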

View File

@@ -6,6 +6,7 @@
*/
#include "shared/source/helpers/compiler_hw_info_config.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/test_macros/test.h"
@@ -196,3 +197,8 @@ HWTEST2_F(HwInfoConfigTest, givenHwInfoConfigWhenIsPlatformQueryNotSupportedThen
const auto &hwInfoConfig = *HwInfoConfig::get(productFamily);
EXPECT_FALSE(hwInfoConfig.isPlatformQuerySupported());
}
HWTEST_F(HwInfoConfigTest, givenHwHelperWhenCallCopyThroughLockedPtrEnabledThenReturnFalse) {
HwHelper &hwHelper = HwHelper::get(defaultHwInfo->platform.eRenderCoreFamily);
EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled());
}

View File

@@ -6,6 +6,7 @@
*/
#include "shared/source/helpers/hw_helper.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/helpers/hw_helper_tests.h"
#include "shared/test/common/test_macros/header/per_product_test_definitions.h"
@@ -73,3 +74,14 @@ XE_HPC_CORETEST_F(HwHelperXeHpcCoreTest, givenXeHPCPlatformWhenCheckAssignEngine
auto &hwHelper = HwHelperHw<FamilyType>::get();
EXPECT_EQ(hwHelper.isAssignEngineRoundRobinSupported(hwInfo), HwInfoConfig::get(hwInfo.platform.eProductFamily)->isAssignEngineRoundRobinSupported());
}
XE_HPC_CORETEST_F(HwHelperTest, givenHwHelperWithFlagSetWhenCallCopyThroughLockedPtrEnabledThenReturnCorrectValue) {
DebugManagerStateRestore restore;
auto &hwHelper = HwHelperHw<FamilyType>::get();
DebugManager.flags.ExperimentalCopyThroughLock.set(false);
EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled());
DebugManager.flags.ExperimentalCopyThroughLock.set(true);
EXPECT_TRUE(hwHelper.copyThroughLockedPtrEnabled());
}