Files
compute-runtime/shared/source/page_fault_manager/tbx_page_fault_manager.cpp
Jack Myers 7f9fadc314 fix: regression caused by tbx fault mngr
Addresses regressions from the reverted merge
of the tbx fault manager for host memory.

Recursive locking of the mutex caused a deadlock.

To fix, separate tbx fault data from base
cpu fault data, allowing separate mutexes
for each, eliminating recursive locks on
the same mutex.

By separating, we also help ensure that tbx-related
changes don't affect the original cpu fault manager code
paths.

As an added safeguard preventing critical regressions
and avoiding another auto-revert, the tbx fault manager
is hidden behind a new debug flag which is disabled by default.

Related-To: NEO-12268
Signed-off-by: Jack Myers <jack.myers@intel.com>
2025-01-09 07:48:53 +01:00

76 lines
2.5 KiB
C++

/*
* Copyright (C) 2024-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/page_fault_manager/tbx_page_fault_manager.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
namespace NEO {
// Entry point for a fault at `ptr`.
//
// The TBX bookkeeping (memoryDataTbx) is consulted first, under mtxTbx. When
// getFaultData() yields a non-null pointer the fault is considered TBX-owned:
// it is serviced via handlePageFault() if `handleFault` is set, and true is
// returned either way. When getFaultData() yields nullptr the request is
// forwarded unchanged to CpuPageFaultManager and its result is returned.
//
// NOTE(review): mtxTbx remains held for the duration of the base-class call,
// because the guard only goes out of scope when this function returns.
bool TbxPageFaultManager::verifyAndHandlePageFault(void *ptr, bool handleFault) {
    std::unique_lock<SpinLock> tbxGuard{mtxTbx};

    auto trackedPtr = getFaultData(memoryDataTbx, ptr, handleFault);
    if (trackedPtr != nullptr) {
        if (handleFault) {
            handlePageFault(trackedPtr, memoryDataTbx[trackedPtr]);
        }
        return true;
    }

    // Not a TBX-tracked address: let the regular CPU fault manager decide.
    return CpuPageFaultManager::verifyAndHandlePageFault(ptr, handleFault);
}
void TbxPageFaultManager::handlePageFault(void *ptr, PageFaultDataTbx &faultData) {
auto &graphicsAllocation = *faultData.gfxAllocation;
auto bank = faultData.bank;
auto hasBeenDownloaded = faultData.hasBeenDownloaded;
auto size = faultData.size;
auto csr = faultData.csr;
if (!hasBeenDownloaded) {
this->allowCPUMemoryAccess(ptr, size);
csr->downloadAllocation(graphicsAllocation);
this->protectCpuMemoryFromWrites(ptr, size);
faultData.hasBeenDownloaded = true;
} else {
graphicsAllocation.setTbxWritable(true, bank);
this->allowCPUMemoryAccess(ptr, size);
this->memoryDataTbx.erase(ptr);
}
}
// Stops tracking `alloc`.
//
// Under mtxTbx, memoryDataTbx is scanned for the first entry whose
// gfxAllocation equals `alloc`. That entry is erased and CPU access to its
// region is allowed again. Nothing happens when no entry matches.
void TbxPageFaultManager::removeAllocation(GraphicsAllocation *alloc) {
    std::unique_lock<SpinLock> tbxGuard{mtxTbx};

    for (auto &entry : memoryDataTbx) {
        if (entry.second.gfxAllocation != alloc) {
            continue;
        }

        // Copy what is needed out of the entry before erasing it; `entry`
        // is invalid afterwards, which is also why the loop must stop here.
        auto trackedPtr = entry.first;
        auto trackedSize = entry.second.size;

        memoryDataTbx.erase(trackedPtr);
        allowCPUMemoryAccess(trackedPtr, trackedSize);
        return;
    }
}
// Starts (or restarts) tracking of the region [ptr, ptr + size).
//
// Under mtxTbx: if `ptr` has no entry yet, one is created from the supplied
// csr / alloc / bank / size. In every case the entry's hasBeenDownloaded flag
// is reset to false and protectCPUMemoryAccess() is applied to the region.
//
// NOTE(review): when an entry for `ptr` already exists, its stored size, csr,
// bank and allocation are left as they were, while the protection call uses
// the `size` argument - confirm callers never re-insert with different values.
void TbxPageFaultManager::insertAllocation(CommandStreamReceiver *csr, GraphicsAllocation *alloc, uint32_t bank, void *ptr, size_t size) {
    std::unique_lock<SpinLock> tbxGuard{mtxTbx};

    const bool alreadyTracked = memoryDataTbx.find(ptr) != memoryDataTbx.end();
    if (!alreadyTracked) {
        PageFaultDataTbx newEntry{};
        newEntry.csr = csr;
        newEntry.gfxAllocation = alloc;
        newEntry.bank = bank;
        newEntry.size = size;
        memoryDataTbx[ptr] = newEntry;
    }

    // Reset the download flag so the next fault on this entry takes the
    // not-yet-downloaded path in handlePageFault().
    memoryDataTbx[ptr].hasBeenDownloaded = false;
    protectCPUMemoryAccess(ptr, size);
}
} // namespace NEO