feature: Set shared system USM madvise preferred location as prefetch region

Resolves: NEO-16482

Signed-off-by: John Falkowski <john.falkowski@intel.com>
This commit is contained in:
John Falkowski
2025-11-01 21:36:13 +00:00
committed by Compute-Runtime-Automation
parent 0f5381456d
commit 6b63304673
8 changed files with 184 additions and 23 deletions

View File

@@ -452,6 +452,7 @@ DECLARE_DEBUG_VARIABLE(int64_t, ForceGmmSystemMemoryBufferForAllocations, 0, "0:
DECLARE_DEBUG_VARIABLE(int32_t, ForceLowLatencyHint, -1, "Force passing low latency hint during xe_exec_queue creation. -1: default, 0: disabled, 1: enabled");
DECLARE_DEBUG_VARIABLE(int32_t, EmitMemAdvisePriorToCopyForNonUsm, -1, "Enable Memadvise to system memory for copy/fill with shared system input: -1: default, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, TreatNonUsmForTransfersAsSharedSystem, -1, "-1: default, 0: import non-usm as external host ptr on copy/fill (legacy mode), 1: treat non usm on copy/fill as shared system usm")
// Read by IoctlHelperXe::setVmSharedSystemMemPrefetch: any value other than -1 replaces the default
// DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC prefetch target with an explicitly resolved memory region instance.
DECLARE_DEBUG_VARIABLE(int32_t, OverrideMadviseSharedSystemPrefetchRegion, -1, "-1: default (madvise), 0: system memory, 1: same-tile local memory")
/*DIRECT SUBMISSION FLAGS*/
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD")

View File

@@ -372,7 +372,7 @@ bool DrmMemoryManager::prefetchSharedSystemAlloc(const void *ptr, const size_t s
auto memoryClassDevice = ioctlHelper->getDrmParamValue(DrmParam::memoryClassDevice);
auto region = static_cast<uint32_t>((memoryClassDevice << 16u) | subDeviceIds[0]);
auto vmId = drm.getVirtualMemoryAddressSpace(subDeviceIds[0]);
return ioctlHelper->setVmPrefetch(reinterpret_cast<uint64_t>(ptr), size, region, vmId);
return ioctlHelper->setVmSharedSystemMemPrefetch(reinterpret_cast<uint64_t>(ptr), size, region, vmId);
}
bool DrmMemoryManager::setMemPrefetch(GraphicsAllocation *gfxAllocation, SubDeviceIdsVec &subDeviceIds, uint32_t rootDeviceIndex) {

View File

@@ -137,6 +137,7 @@ class IoctlHelper {
// Base-class default: ignores its arguments and reports AtomicAccessMode::none; helpers may override.
virtual AtomicAccessMode getVmSharedSystemAtomicAttribute(uint64_t handle, const size_t size, const uint32_t vmId) { return AtomicAccessMode::none; }
virtual bool setVmBoAdviseForChunking(int32_t handle, uint64_t start, uint64_t length, uint32_t attribute, void *region) = 0;
virtual bool setVmPrefetch(uint64_t start, uint64_t length, uint32_t region, uint32_t vmId) = 0;
// Base-class default: performs no work and reports success (true), so helpers that do not override it
// treat a shared system memory prefetch request as a successful no-op.
virtual bool setVmSharedSystemMemPrefetch(uint64_t start, uint64_t length, uint32_t region, uint32_t vmId) { return true; }
virtual bool setGemTiling(void *setTiling) = 0;
virtual bool getGemTiling(void *setTiling) = 0;
virtual uint32_t getDirectSubmissionFlag() = 0;

View File

@@ -994,6 +994,47 @@ bool IoctlHelperXe::setVmPrefetch(uint64_t start, uint64_t length, uint32_t regi
return true;
}
// Issues a single DRM_XE_VM_BIND_OP_PREFETCH vm-bind for the given range of shared system memory.
//
// start/length - requested range; the bind uses alignDown(start, pageSize) as the address and
//                alignSizeWholePage(start, length) as the range.
// region       - only the low DeviceBitfield().size() bits are used (as a sub-device id), and only when
//                the OverrideMadviseSharedSystemPrefetchRegion debug flag is set.
// vmId         - VM the bind is issued against.
//
// Returns true when the gemVmBind ioctl returns 0, false otherwise.
bool IoctlHelperXe::setVmSharedSystemMemPrefetch(uint64_t start, uint64_t length, uint32_t region, uint32_t vmId) {
    const auto alignedStart = alignDown(start, MemoryConstants::pageSize);
    const auto alignedLength = alignSizeWholePage(reinterpret_cast<void *>(start), length);

    xeLog(" -> IoctlHelperXe::%s s=0x%llx l=0x%llx align_s=0x%llx align_l=0x%llx vmid=0x%x\n", __FUNCTION__, start, length, alignedStart, alignedLength, vmId);

    drm_xe_vm_bind bind = {};
    bind.vm_id = vmId;
    bind.num_binds = 1;
    bind.bind.op = DRM_XE_VM_BIND_OP_PREFETCH;
    bind.bind.addr = alignedStart;
    bind.bind.range = alignedLength;

    auto pHwInfo = this->drm.getRootDeviceEnvironment().getHardwareInfo();
    const auto regionOverride = debugManager.flags.OverrideMadviseSharedSystemPrefetchRegion.get();

    if (regionOverride == -1) {
        // Default: no explicit region, the bind carries the "consult mem-advise preferred location" marker.
        bind.bind.prefetch_mem_region_instance = DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC;
    } else {
        // Debug override: build a sub-device mask from the flag value and resolve it to a region instance.
        // NOTE(review): the flag value itself is shifted, so only 0 (empty mask) and 1 (bit of the
        // sub-device) match the documented meanings; other values are not validated here - confirm intent.
        constexpr uint32_t subDeviceMaskSize = DeviceBitfield().size();
        constexpr uint32_t subDeviceMaskMax = (1u << subDeviceMaskSize) - 1u;
        const uint32_t subDeviceId = region & subDeviceMaskMax;
        DeviceBitfield subDeviceMask = (regionOverride << subDeviceId);
        MemoryClassInstance regionInstanceClass = this->drm.getMemoryInfo()->getMemoryRegionClassAndInstance(subDeviceMask, *pHwInfo);
        bind.bind.prefetch_mem_region_instance = regionInstanceClass.memoryInstance;
    }

    const int ret = IoctlHelper::ioctl(DrmIoctl::gemVmBind, &bind);

    xeLog(" vm=%d addr=0x%lx range=0x%lx region=0x%x operation=%d(%s) ret=%d\n",
          bind.vm_id,
          bind.bind.addr,
          bind.bind.range,
          bind.bind.prefetch_mem_region_instance,
          bind.bind.op,
          xeGetBindOperationName(bind.bind.op),
          ret);

    const bool succeeded = (ret == 0);
    if (!succeeded) {
        xeLog("error: %s ret=%d\n", xeGetBindOperationName(bind.bind.op), ret);
    }
    return succeeded;
}
uint32_t IoctlHelperXe::getDirectSubmissionFlag() {
xeLog(" -> IoctlHelperXe::%s\n", __FUNCTION__);
return 0;

View File

@@ -64,6 +64,7 @@ class IoctlHelperXe : public IoctlHelper {
AtomicAccessMode getVmSharedSystemAtomicAttribute(uint64_t handle, const size_t size, const uint32_t vmId) override;
bool setVmBoAdviseForChunking(int32_t handle, uint64_t start, uint64_t length, uint32_t attribute, void *region) override;
bool setVmPrefetch(uint64_t start, uint64_t length, uint32_t region, uint32_t vmId) override;
bool setVmSharedSystemMemPrefetch(uint64_t start, uint64_t length, uint32_t region, uint32_t vmId) override;
bool setGemTiling(void *setTiling) override;
bool getGemTiling(void *setTiling) override;
uint32_t getDirectSubmissionFlag() override;

View File

@@ -650,6 +650,7 @@ EnableDeferBacking = 0
ForceLowLatencyHint = -1
EmitMemAdvisePriorToCopyForNonUsm = -1
TreatNonUsmForTransfersAsSharedSystem = -1
OverrideMadviseSharedSystemPrefetchRegion = -1
SetMaxBVHLevels = -1
GetSipBinaryFromExternalLib = -1
LogUsmReuse = 0

View File

@@ -6472,46 +6472,36 @@ HWTEST_TEMPLATED_F(DrmMemoryManagerTest, givenDrmMemoryManagerWhenSetMemPrefetch
}
// NOTE(review): this span is a rendered diff hunk in which the removed and the added version of the test
// are interleaved without +/- markers (`subDeviceIds` and `ptr` are each declared twice), so it does not
// compile as shown. Both versions call memoryManager->prefetchSharedSystemAlloc() on a 1024-byte malloc'ed
// buffer for sub-device 0 and expect it to return true.
HWTEST_TEMPLATED_F(DrmMemoryManagerTest, givenPrefetchSharedSystemAllocIsCalledThenReturnTrue) {
SubDeviceIdsVec subDeviceIds{0};
class MyMockIoctlHelper : public MockIoctlHelper {
public:
using MockIoctlHelper::MockIoctlHelper;
bool setVmPrefetch(uint64_t start, uint64_t length, uint32_t region, uint32_t vmId) override {
return true;
}
};
auto mockIoctlHelper = new MyMockIoctlHelper(*mock);
auto &drm = static_cast<DrmMockCustom &>(memoryManager->getDrm(mockRootDeviceIndex));
drm.ioctlHelper.reset(mockIoctlHelper);
auto ptr = malloc(1024);
EXPECT_TRUE(memoryManager->prefetchSharedSystemAlloc(ptr, 1024, subDeviceIds, rootDeviceIndex));
void *ptr = malloc(1024);
auto subDeviceIds = NEO::SubDeviceIdsVec{0};
EXPECT_TRUE(memoryManager->prefetchSharedSystemAlloc(ptr, 1024, subDeviceIds, mockRootDeviceIndex));
free(ptr);
}
// NOTE(review): this span is a rendered diff hunk in which removed and added lines are interleaved without
// +/- markers (two `public:` labels, an unterminated setVmPrefetch override next to the
// setVmSharedSystemMemPrefetch one, duplicate `ptr`/`subDeviceIds` declarations, and both EXPECT_TRUE and
// EXPECT_FALSE on the same call), so it does not compile as shown.
// The local MyMockIoctlHelper is installed as the DRM ioctl helper; its setVmSharedSystemMemPrefetch
// override counts invocations and returns false.
HWTEST_TEMPLATED_F(DrmMemoryManagerTest, givenPrefetchSharedSystemAllocIsCalledThenReturnFalse) {
SubDeviceIdsVec subDeviceIds{0};
class MyMockIoctlHelper : public MockIoctlHelper {
public:
using MockIoctlHelper::MockIoctlHelper;
bool setVmPrefetch(uint64_t start, uint64_t length, uint32_t region, uint32_t vmId) override {
public:
bool setVmSharedSystemMemPrefetch(uint64_t start, uint64_t length, uint32_t region, uint32_t vmId) override {
setVmSharedSystemMemPrefetchCalled++;
return false;
}
uint32_t setVmSharedSystemMemPrefetchCalled = 0;
};
auto mockIoctlHelper = new MyMockIoctlHelper(*mock);
auto &drm = static_cast<DrmMockCustom &>(memoryManager->getDrm(mockRootDeviceIndex));
drm.ioctlHelper.reset(mockIoctlHelper);
auto ptr = malloc(1024);
EXPECT_TRUE(memoryManager->prefetchSharedSystemAlloc(ptr, 1024, subDeviceIds, rootDeviceIndex));
void *ptr = malloc(1024);
auto subDeviceIds = NEO::SubDeviceIdsVec{0};
EXPECT_FALSE(memoryManager->prefetchSharedSystemAlloc(ptr, 1024, subDeviceIds, mockRootDeviceIndex));
EXPECT_EQ(1u, mockIoctlHelper->setVmSharedSystemMemPrefetchCalled);
free(ptr);
}

View File

@@ -3044,6 +3044,132 @@ TEST_F(IoctlHelperXeTest, givenIoctlHelperXeWhenCallingSetVmPrefetchThenVmBindIs
EXPECT_EQ(drm->vmBindInputs[0].bind.prefetch_mem_region_instance, targetMemoryRegion.memoryInstance);
}
// DrmMockXe variant whose ioctl() reports failure (-1) for every gemVmBind request; all other requests
// are forwarded unchanged to DrmMockXe::ioctl().
struct DrmMockXePrefetchFail : public DrmMockXe {
    // Factory: constructs the mock, runs initInstance() on it and hands ownership to the caller.
    static auto create(RootDeviceEnvironment &rootDeviceEnvironment) {
        std::unique_ptr<DrmMockXePrefetchFail> instance{new DrmMockXePrefetchFail{rootDeviceEnvironment}};
        instance->initInstance();
        return instance;
    }

    int ioctl(DrmIoctl request, void *arg) override {
        const bool isVmBind = (request == DrmIoctl::gemVmBind);
        return isVmBind ? -1 : DrmMockXe::ioctl(request, arg);
    }

    // NOTE(review): the two members below are not referenced anywhere inside this struct - confirm
    // whether any test still relies on them.
    int gemVmAdviseReturn = 0;
    StackVec<drm_xe_madvise, 4> vmAdviseInputs;

  protected:
    // Don't call directly, use the create() function
    DrmMockXePrefetchFail(RootDeviceEnvironment &rootDeviceEnvironment) : DrmMockXe(rootDeviceEnvironment) {}
};
// With a DRM mock that fails every gemVmBind ioctl, setVmSharedSystemMemPrefetch must report failure.
TEST_F(IoctlHelperXeTest, givenIoctlHelperXeWhenCallingSetVmSharedSystemMemPrefetchThenFailureIsReturned) {
    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
    auto failingDrm = DrmMockXePrefetchFail::create(*executionEnvironment->rootDeviceEnvironments[0]);
    auto *ioctlHelperXe = static_cast<MockIoctlHelperXe *>(failingDrm->getIoctlHelper());

    const uint32_t vmId = 1u;
    const uint32_t subDeviceId = 0u;
    const uint64_t start = 0x12u;
    const uint64_t length = 0x34u;

    // Region encoding used by the test: class value in the upper 16 bits, sub-device id in the lower bits.
    const int memoryClassDevice = static_cast<int>(DrmParam::memoryClassDevice);
    const uint32_t region = (memoryClassDevice << 16u) | subDeviceId;

    EXPECT_FALSE(ioctlHelperXe->setVmSharedSystemMemPrefetch(start, length, region, vmId));
}
// Without the OverrideMadviseSharedSystemPrefetchRegion debug flag, the prefetch bind must carry
// DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC (truncated to 32 bits) as its region instance, together with the
// page-aligned address/range and the requested VM id.
TEST_F(IoctlHelperXeTest, givenIoctlHelperXeWhenCallingSetVmSharedSystemMemPrefetchThenMemRegionInstanceIsMemAdvisePreferredLocation) {
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableLocalMemory.set(1);

    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
    auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]);
    auto xeIoctlHelper = static_cast<MockIoctlHelperXe *>(drm->getIoctlHelper());
    xeIoctlHelper->initialize();

    const uint64_t start = 0x12u;
    const uint64_t length = 0x34u;
    const uint32_t subDeviceId = 0u;
    const uint32_t vmId = 1u;
    const int memoryClassDevice = static_cast<int>(DrmParam::memoryClassDevice);
    const uint32_t region = (memoryClassDevice << 16u) | subDeviceId;

    EXPECT_TRUE(xeIoctlHelper->setVmSharedSystemMemPrefetch(start, length, region, vmId));

    // Fatal assertion: the first recorded bind is dereferenced below, so stop here if none was captured.
    ASSERT_EQ(1u, drm->vmBindInputs.size());
    const auto &bindInput = drm->vmBindInputs[0];
    EXPECT_EQ(bindInput.vm_id, vmId);
    EXPECT_EQ(bindInput.bind.addr, alignDown(start, MemoryConstants::pageSize));
    EXPECT_EQ(bindInput.bind.range, alignSizeWholePage(reinterpret_cast<void *>(start), length));
    EXPECT_EQ(bindInput.bind.prefetch_mem_region_instance, static_cast<uint64_t>(DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC) & 0xffffffff);
}
// With OverrideMadviseSharedSystemPrefetchRegion set to 1, the prefetch bind must target the memory
// instance of the local memory region belonging to the requested sub-device.
TEST_F(IoctlHelperXeTest, givenIoctlHelperXeWhenCallingSetVmSharedSystemMemPrefetchWithDebugVarThenMemRegionInstanceIsDeviceLocal) {
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableLocalMemory.set(1);
    debugManager.flags.OverrideMadviseSharedSystemPrefetchRegion.set(1);

    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
    auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]);
    auto xeIoctlHelper = static_cast<MockIoctlHelperXe *>(drm->getIoctlHelper());
    xeIoctlHelper->initialize();

    const uint64_t start = 0x12u;
    const uint64_t length = 0x34u;
    const uint32_t subDeviceId = 0u;
    const uint32_t vmId = 1u;

    // Capture the expected region before ownership of the memory info moves to the DRM mock.
    auto memoryInfo = xeIoctlHelper->createMemoryInfo();
    ASSERT_NE(nullptr, memoryInfo);
    MemoryClassInstance targetMemoryRegion = memoryInfo->getLocalMemoryRegions()[subDeviceId].region;
    drm->memoryInfo.reset(memoryInfo.release());

    const int memoryClassDevice = static_cast<int>(DrmParam::memoryClassDevice);
    const uint32_t region = (memoryClassDevice << 16u) | subDeviceId;

    EXPECT_TRUE(xeIoctlHelper->setVmSharedSystemMemPrefetch(start, length, region, vmId));

    // Fatal assertion: the first recorded bind is dereferenced below, so stop here if none was captured.
    ASSERT_EQ(1u, drm->vmBindInputs.size());
    const auto &bindInput = drm->vmBindInputs[0];
    EXPECT_EQ(bindInput.vm_id, vmId);
    EXPECT_EQ(bindInput.bind.addr, alignDown(start, MemoryConstants::pageSize));
    EXPECT_EQ(bindInput.bind.range, alignSizeWholePage(reinterpret_cast<void *>(start), length));
    EXPECT_EQ(bindInput.bind.prefetch_mem_region_instance, targetMemoryRegion.memoryInstance);
}
// With OverrideMadviseSharedSystemPrefetchRegion set to 0, the prefetch bind is expected to carry
// memory region instance 0 (the flag's "system memory" setting).
TEST_F(IoctlHelperXeTest, givenIoctlHelperXeWhenCallingSetVmSharedSystemMemPrefetchWithDebugVarThenMemRegionInstanceIsSystem) {
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableLocalMemory.set(1);
    debugManager.flags.OverrideMadviseSharedSystemPrefetchRegion.set(0);

    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
    auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]);
    auto xeIoctlHelper = static_cast<MockIoctlHelperXe *>(drm->getIoctlHelper());
    xeIoctlHelper->initialize();

    const uint64_t start = 0x12u;
    const uint64_t length = 0x34u;
    const uint32_t subDeviceId = 0u;
    const uint32_t vmId = 1u;

    // The override path resolves the region through the DRM's memory info, so install one first.
    auto memoryInfo = xeIoctlHelper->createMemoryInfo();
    ASSERT_NE(nullptr, memoryInfo);
    drm->memoryInfo.reset(memoryInfo.release());

    const int memoryClassDevice = static_cast<int>(DrmParam::memoryClassDevice);
    const uint32_t region = (memoryClassDevice << 16u) | subDeviceId;

    EXPECT_TRUE(xeIoctlHelper->setVmSharedSystemMemPrefetch(start, length, region, vmId));

    // Fatal assertion: the first recorded bind is dereferenced below, so stop here if none was captured.
    ASSERT_EQ(1u, drm->vmBindInputs.size());
    const auto &bindInput = drm->vmBindInputs[0];
    EXPECT_EQ(bindInput.vm_id, vmId);
    EXPECT_EQ(bindInput.bind.addr, alignDown(start, MemoryConstants::pageSize));
    EXPECT_EQ(bindInput.bind.range, alignSizeWholePage(reinterpret_cast<void *>(start), length));
    EXPECT_EQ(bindInput.bind.prefetch_mem_region_instance, 0u);
}
TEST_F(IoctlHelperXeTest, givenIoctlHelperXeWhenCallingSetVmPrefetchOnSecondTileThenVmBindIsCalled) {
DebugManagerStateRestore restorer;
debugManager.flags.EnableLocalMemory.set(1);