mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-06 02:18:05 +08:00
fix: regression caused by tbx fault mngr
Addresses regressions from the reverted merge of the TBX fault manager for host memory. Recursive locking of a mutex caused a deadlock. To fix this, separate the TBX fault data from the base CPU fault data, allowing a separate mutex for each and eliminating recursive locks on the same mutex. This separation also helps ensure that TBX-related changes do not affect the original CPU fault manager code paths. As an added safeguard against critical regressions and another auto-revert, the TBX fault manager is hidden behind a new debug flag, which is disabled by default. Related-To: NEO-12268 Signed-off-by: Jack Myers <jack.myers@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
b8157a2547
commit
7f9fadc314
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2024 Intel Corporation
|
||||
* Copyright (C) 2020-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -196,6 +196,6 @@ struct DeviceImp : public Device, NEO::NonCopyableOrMovableClass {
|
||||
std::unique_ptr<DebugSession> debugSession;
|
||||
};
|
||||
|
||||
void transferAndUnprotectMemoryWithHints(NEO::PageFaultManager *pageFaultHandler, void *allocPtr, NEO::PageFaultManager::PageFaultData &pageFaultData);
|
||||
void transferAndUnprotectMemoryWithHints(NEO::CpuPageFaultManager *pageFaultHandler, void *allocPtr, NEO::CpuPageFaultManager::PageFaultData &pageFaultData);
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2024 Intel Corporation
|
||||
* Copyright (C) 2020-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -16,7 +16,7 @@
|
||||
#include "level_zero/core/source/driver/driver_handle_imp.h"
|
||||
|
||||
namespace NEO {
|
||||
void PageFaultManager::transferToCpu(void *ptr, size_t size, void *device) {
|
||||
void CpuPageFaultManager::transferToCpu(void *ptr, size_t size, void *device) {
|
||||
L0::DeviceImp *deviceImp = static_cast<L0::DeviceImp *>(device);
|
||||
deviceImp->getNEODevice()->stopDirectSubmissionForCopyEngine();
|
||||
|
||||
@@ -29,7 +29,7 @@ void PageFaultManager::transferToCpu(void *ptr, size_t size, void *device) {
|
||||
allocData->size, true);
|
||||
UNRECOVERABLE_IF(ret);
|
||||
}
|
||||
void PageFaultManager::transferToGpu(void *ptr, void *device) {
|
||||
void CpuPageFaultManager::transferToGpu(void *ptr, void *device) {
|
||||
L0::DeviceImp *deviceImp = static_cast<L0::DeviceImp *>(device);
|
||||
deviceImp->getNEODevice()->stopDirectSubmissionForCopyEngine();
|
||||
|
||||
@@ -44,7 +44,7 @@ void PageFaultManager::transferToGpu(void *ptr, void *device) {
|
||||
|
||||
this->evictMemoryAfterImplCopy(allocData->cpuAllocation, deviceImp->getNEODevice());
|
||||
}
|
||||
void PageFaultManager::allowCPUMemoryEviction(bool evict, void *ptr, PageFaultData &pageFaultData) {
|
||||
void CpuPageFaultManager::allowCPUMemoryEviction(bool evict, void *ptr, PageFaultData &pageFaultData) {
|
||||
L0::DeviceImp *deviceImp = static_cast<L0::DeviceImp *>(pageFaultData.cmdQ);
|
||||
|
||||
CommandStreamReceiver *csr = nullptr;
|
||||
@@ -61,9 +61,9 @@ void PageFaultManager::allowCPUMemoryEviction(bool evict, void *ptr, PageFaultDa
|
||||
} // namespace NEO
|
||||
|
||||
namespace L0 {
|
||||
void transferAndUnprotectMemoryWithHints(NEO::PageFaultManager *pageFaultHandler, void *allocPtr, NEO::PageFaultManager::PageFaultData &pageFaultData) {
|
||||
void transferAndUnprotectMemoryWithHints(NEO::CpuPageFaultManager *pageFaultHandler, void *allocPtr, NEO::CpuPageFaultManager::PageFaultData &pageFaultData) {
|
||||
bool migration = true;
|
||||
if (pageFaultData.domain == NEO::PageFaultManager::AllocationDomain::gpu) {
|
||||
if (pageFaultData.domain == NEO::CpuPageFaultManager::AllocationDomain::gpu) {
|
||||
L0::DeviceImp *deviceImp = static_cast<L0::DeviceImp *>(pageFaultData.cmdQ);
|
||||
NEO::SvmAllocationData *allocData = deviceImp->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(allocPtr);
|
||||
|
||||
@@ -87,7 +87,7 @@ void transferAndUnprotectMemoryWithHints(NEO::PageFaultManager *pageFaultHandler
|
||||
}
|
||||
}
|
||||
if (migration) {
|
||||
pageFaultData.domain = NEO::PageFaultManager::AllocationDomain::cpu;
|
||||
pageFaultData.domain = NEO::CpuPageFaultManager::AllocationDomain::cpu;
|
||||
}
|
||||
pageFaultHandler->allowCPUMemoryAccess(allocPtr, pageFaultData.size);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2024 Intel Corporation
|
||||
* Copyright (C) 2020-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -545,9 +545,9 @@ TEST_F(CommandListMemAdvisePageFault, givenValidPtrAndPageFaultHandlerAndGpuDoma
|
||||
|
||||
EXPECT_EQ(handlerWithHints, reinterpret_cast<void *>(mockPageFaultManager->gpuDomainHandler));
|
||||
|
||||
NEO::PageFaultManager::PageFaultData pageData;
|
||||
NEO::CpuPageFaultManager::PageFaultData pageData;
|
||||
pageData.cmdQ = deviceImp;
|
||||
pageData.domain = NEO::PageFaultManager::AllocationDomain::gpu;
|
||||
pageData.domain = NEO::CpuPageFaultManager::AllocationDomain::gpu;
|
||||
mockPageFaultManager->gpuDomainHandler(mockPageFaultManager, ptr, pageData);
|
||||
flags = deviceImp->memAdviseSharedAllocations[allocData];
|
||||
EXPECT_EQ(1, flags.cpuMigrationBlocked);
|
||||
@@ -586,9 +586,9 @@ TEST_F(CommandListMemAdvisePageFault, givenValidPtrAndPageFaultHandlerAndGpuDoma
|
||||
|
||||
EXPECT_EQ(handlerWithHints, reinterpret_cast<void *>(mockPageFaultManager->gpuDomainHandler));
|
||||
|
||||
NEO::PageFaultManager::PageFaultData pageData;
|
||||
NEO::CpuPageFaultManager::PageFaultData pageData;
|
||||
pageData.cmdQ = deviceImp;
|
||||
pageData.domain = NEO::PageFaultManager::AllocationDomain::gpu;
|
||||
pageData.domain = NEO::CpuPageFaultManager::AllocationDomain::gpu;
|
||||
pageData.unifiedMemoryManager = device->getDriverHandle()->getSvmAllocsManager();
|
||||
EXPECT_EQ(0u, device->getDriverHandle()->getSvmAllocsManager()->nonGpuDomainAllocs.size());
|
||||
mockPageFaultManager->gpuDomainHandler(mockPageFaultManager, ptr, pageData);
|
||||
@@ -661,9 +661,9 @@ TEST_F(CommandListMemAdvisePageFault, givenValidPtrAndPageFaultHandlerAndGpuDoma
|
||||
|
||||
testing::internal::CaptureStdout(); // start capturing
|
||||
|
||||
NEO::PageFaultManager::PageFaultData pageData;
|
||||
NEO::CpuPageFaultManager::PageFaultData pageData;
|
||||
pageData.cmdQ = deviceImp;
|
||||
pageData.domain = NEO::PageFaultManager::AllocationDomain::gpu;
|
||||
pageData.domain = NEO::CpuPageFaultManager::AllocationDomain::gpu;
|
||||
pageData.unifiedMemoryManager = device->getDriverHandle()->getSvmAllocsManager();
|
||||
mockPageFaultManager->gpuDomainHandler(mockPageFaultManager, ptr, pageData);
|
||||
flags = deviceImp->memAdviseSharedAllocations[allocData];
|
||||
@@ -715,9 +715,9 @@ TEST_F(CommandListMemAdvisePageFault, givenValidPtrAndPageFaultHandlerAndGpuDoma
|
||||
|
||||
EXPECT_EQ(handlerWithHints, reinterpret_cast<void *>(mockPageFaultManager->gpuDomainHandler));
|
||||
|
||||
NEO::PageFaultManager::PageFaultData pageData;
|
||||
NEO::CpuPageFaultManager::PageFaultData pageData;
|
||||
pageData.cmdQ = deviceImp;
|
||||
pageData.domain = NEO::PageFaultManager::AllocationDomain::gpu;
|
||||
pageData.domain = NEO::CpuPageFaultManager::AllocationDomain::gpu;
|
||||
pageData.unifiedMemoryManager = device->getDriverHandle()->getSvmAllocsManager();
|
||||
mockPageFaultManager->gpuDomainHandler(mockPageFaultManager, ptr, pageData);
|
||||
flags = deviceImp->memAdviseSharedAllocations[allocData];
|
||||
@@ -762,9 +762,9 @@ TEST_F(CommandListMemAdvisePageFault, givenValidPtrAndPageFaultHandlerAndGpuDoma
|
||||
|
||||
EXPECT_EQ(handlerWithHints, reinterpret_cast<void *>(mockPageFaultManager->gpuDomainHandler));
|
||||
|
||||
NEO::PageFaultManager::PageFaultData pageData;
|
||||
NEO::CpuPageFaultManager::PageFaultData pageData;
|
||||
pageData.cmdQ = deviceImp;
|
||||
pageData.domain = NEO::PageFaultManager::AllocationDomain::cpu;
|
||||
pageData.domain = NEO::CpuPageFaultManager::AllocationDomain::cpu;
|
||||
pageData.unifiedMemoryManager = device->getDriverHandle()->getSvmAllocsManager();
|
||||
mockPageFaultManager->gpuDomainHandler(mockPageFaultManager, ptr, pageData);
|
||||
flags = deviceImp->memAdviseSharedAllocations[allocData];
|
||||
@@ -809,9 +809,9 @@ TEST_F(CommandListMemAdvisePageFault, givenInvalidPtrAndPageFaultHandlerAndGpuDo
|
||||
|
||||
EXPECT_EQ(handlerWithHints, reinterpret_cast<void *>(mockPageFaultManager->gpuDomainHandler));
|
||||
|
||||
NEO::PageFaultManager::PageFaultData pageData;
|
||||
NEO::CpuPageFaultManager::PageFaultData pageData;
|
||||
pageData.cmdQ = deviceImp;
|
||||
pageData.domain = NEO::PageFaultManager::AllocationDomain::gpu;
|
||||
pageData.domain = NEO::CpuPageFaultManager::AllocationDomain::gpu;
|
||||
pageData.unifiedMemoryManager = device->getDriverHandle()->getSvmAllocsManager();
|
||||
void *alloc = reinterpret_cast<void *>(0x1);
|
||||
mockPageFaultManager->gpuDomainHandler(mockPageFaultManager, alloc, pageData);
|
||||
@@ -838,7 +838,7 @@ TEST_F(CommandListMemAdvisePageFault, givenUnifiedMemoryAllocWhenAllowCPUMemoryE
|
||||
|
||||
L0::DeviceImp *deviceImp = static_cast<L0::DeviceImp *>((L0::Device::fromHandle(device)));
|
||||
|
||||
NEO::PageFaultManager::PageFaultData pageData;
|
||||
NEO::CpuPageFaultManager::PageFaultData pageData;
|
||||
pageData.cmdQ = deviceImp;
|
||||
|
||||
mockPageFaultManager->baseAllowCPUMemoryEviction(true, ptr, pageData);
|
||||
|
||||
Reference in New Issue
Block a user