fix: tbx page fault manager hang

- Updated `isAllocTbxFaultable` to exclude `gpuTimestampDeviceBuffer` from being
faultable.
- Replaced `SpinLock` with `RecursiveSpinLock` in `CpuPageFaultManager` and
`TbxPageFaultManager` to allow recursive locking (see the sketch below).
- Added unit tests to verify the correct handling of `gpuTimestampDeviceBuffer`
in `TbxCommandStreamTests`.
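
The recursive lock matters because, on the TBX path, handling one fault can re-enter the fault manager on the same thread while its lock is already held; with a plain `std::mutex` that second acquisition never returns, one plausible way the reported hang can occur. Below is a minimal, self-contained sketch of that re-entrancy pattern; `FaultManagerSketch` and its members are hypothetical names for illustration, not the actual NEO call chain.

```cpp
#include <iostream>
#include <mutex>
#include <unordered_map>

using RecursiveSpinLock = std::recursive_mutex; // mirrors the new alias in spin_lock.h

struct FaultManagerSketch {
    RecursiveSpinLock mtx; // was effectively std::mutex before this change
    std::unordered_map<void *, bool> protectedAllocs;

    bool verifyAndHandle(void *ptr) {
        std::unique_lock<RecursiveSpinLock> lock{mtx}; // first acquisition
        if (protectedAllocs.find(ptr) == protectedAllocs.end()) {
            return false;
        }
        handleFault(ptr);
        return true;
    }

    void handleFault(void *ptr) {
        protectedAllocs.erase(ptr); // one-time handling
        // Unprotecting and touching the allocation can raise another fault on
        // this same thread, re-entering the manager while mtx is still held.
        verifyAndHandle(ptr); // second acquisition: fine for recursive_mutex, hang for mutex
    }
};

int main() {
    FaultManagerSketch manager;
    int allocation = 0;
    manager.protectedAllocs[&allocation] = true;
    std::cout << std::boolalpha << manager.verifyAndHandle(&allocation) << std::endl; // prints "true"
    return 0;
}
```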

Related-To: NEO-13748
Signed-off-by: Jack Myers <jack.myers@intel.com>
Jack Myers authored on 2025-02-03 18:04:37 +00:00; committed by Compute-Runtime-Automation
commit 7d4e70a25b (parent b7ba71df1c)
12 changed files with 33 additions and 29 deletions

View File

@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -41,9 +41,8 @@ bool AubHelper::isOneTimeAubWritableAllocationType(const AllocationType &type) {
case AllocationType::assertBuffer:
case AllocationType::tagBuffer:
case AllocationType::syncDispatchToken:
return true;
case AllocationType::bufferHostMemory:
- return NEO::debugManager.flags.SetBufferHostMemoryAlwaysAubWritable.get() ? false : true;
+ return (NEO::debugManager.flags.SetBufferHostMemoryAlwaysAubWritable.get() ? false : true) || (NEO::debugManager.flags.EnableTbxPageFaultManager.get() == 1);
default:
return false;
}

View File

@@ -89,7 +89,7 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::isAllocTbxFaultable(GraphicsAllocati
return false;
}
auto allocType = gfxAlloc->getAllocationType();
- return AubHelper::isOneTimeAubWritableAllocationType(allocType) && GraphicsAllocation::isLockable(allocType);
+ return AubHelper::isOneTimeAubWritableAllocationType(allocType) && GraphicsAllocation::isLockable(allocType) && allocType != AllocationType::gpuTimestampDeviceBuffer;
}
template <typename GfxFamily>

View File

@@ -41,7 +41,7 @@ DECLARE_DEBUG_VARIABLE(bool, AUBDumpAllocsOnEnqueueSVMMemcpyOnly, false, "Force
DECLARE_DEBUG_VARIABLE(bool, AUBDumpForceAllToLocalMemory, false, "Force placing every allocation in local memory address space")
DECLARE_DEBUG_VARIABLE(bool, GenerateAubFilePerProcessId, true, "Generate aub file with process id")
DECLARE_DEBUG_VARIABLE(bool, SetBufferHostMemoryAlwaysAubWritable, false, "Make buffer host memory allocation always uploaded to AUB/TBX")
- DECLARE_DEBUG_VARIABLE(bool, EnableTbxPageFaultManager, false, "Enables experiemental page fault manager for host buffer types, improves upon SetBufferHostMemoryAlwaysAubWritable")
+ DECLARE_DEBUG_VARIABLE(bool, EnableTbxPageFaultManager, false, "Enables experimental page fault manager for host buffers and some other alloc types, replaces SetBufferHostMemoryAlwaysAubWritable")
/*DEBUG FLAGS*/
DECLARE_DEBUG_VARIABLE(bool, EnableSWTags, false, "Enable software tagging in batch buffer")

View File

@@ -21,7 +21,7 @@ void CpuPageFaultManager::insertAllocation(void *ptr, size_t size, SVMAllocsMana
auto initialPlacement = MemoryPropertiesHelper::getUSMInitialPlacement(memoryProperties);
const auto domain = (initialPlacement == GraphicsAllocation::UsmInitialPlacement::CPU) ? AllocationDomain::cpu : AllocationDomain::none;
- std::unique_lock<SpinLock> lock{mtx};
+ std::unique_lock<RecursiveSpinLock> lock{mtx};
PageFaultData faultData{};
faultData.size = size;
faultData.unifiedMemoryManager = unifiedMemoryManager;
@@ -35,7 +35,7 @@ void CpuPageFaultManager::insertAllocation(void *ptr, size_t size, SVMAllocsMana
}
void CpuPageFaultManager::removeAllocation(void *ptr) {
- std::unique_lock<SpinLock> lock{mtx};
+ std::unique_lock<RecursiveSpinLock> lock{mtx};
auto alloc = memoryData.find(ptr);
if (alloc != memoryData.end()) {
auto &pageFaultData = alloc->second;
@@ -52,7 +52,7 @@ void CpuPageFaultManager::removeAllocation(void *ptr) {
}
void CpuPageFaultManager::moveAllocationToGpuDomain(void *ptr) {
- std::unique_lock<SpinLock> lock{mtx};
+ std::unique_lock<RecursiveSpinLock> lock{mtx};
auto alloc = memoryData.find(ptr);
if (alloc != memoryData.end()) {
auto &pageFaultData = alloc->second;
@@ -68,7 +68,7 @@ void CpuPageFaultManager::moveAllocationToGpuDomain(void *ptr) {
}
void CpuPageFaultManager::moveAllocationsWithinUMAllocsManagerToGpuDomain(SVMAllocsManager *unifiedMemoryManager) {
- std::unique_lock<SpinLock> lock{mtx};
+ std::unique_lock<RecursiveSpinLock> lock{mtx};
for (auto allocPtr : unifiedMemoryManager->nonGpuDomainAllocs) {
auto &pageFaultData = this->memoryData[allocPtr];
this->migrateStorageToGpuDomain(allocPtr, pageFaultData);
@@ -108,7 +108,7 @@ void CpuPageFaultManager::handlePageFault(void *ptr, PageFaultData &faultData) {
}
bool CpuPageFaultManager::verifyAndHandlePageFault(void *ptr, bool handleFault) {
- std::unique_lock<SpinLock> lock{mtx};
+ std::unique_lock<RecursiveSpinLock> lock{mtx};
auto allocPtr = getFaultData(memoryData, ptr, handleFault);
if (allocPtr == nullptr) {
return false;

View File

@@ -98,6 +98,6 @@ class CpuPageFaultManager : public NonCopyableClass {
gpuDomainHandlerType gpuDomainHandler = &transferAndUnprotectMemory;
std::unordered_map<void *, PageFaultData> memoryData;
- SpinLock mtx;
+ RecursiveSpinLock mtx;
};
} // namespace NEO

View File

@@ -12,9 +12,6 @@
namespace NEO {
- class TbxPageFaultManagerLinux final : public PageFaultManagerLinux, public TbxPageFaultManager {
- public:
- TbxPageFaultManagerLinux() : PageFaultManagerLinux(), TbxPageFaultManager() {}
- };
+ class TbxPageFaultManagerLinux final : public PageFaultManagerLinux, public TbxPageFaultManager {};
} // namespace NEO

View File

@@ -14,7 +14,7 @@
namespace NEO {
bool TbxPageFaultManager::verifyAndHandlePageFault(void *ptr, bool handleFault) {
- std::unique_lock<SpinLock> lock{mtxTbx};
+ std::unique_lock<RecursiveSpinLock> lock{mtxTbx};
auto allocPtr = getFaultData(memoryDataTbx, ptr, handleFault);
if (allocPtr == nullptr) {
return CpuPageFaultManager::verifyAndHandlePageFault(ptr, handleFault);
@@ -43,7 +43,7 @@ void TbxPageFaultManager::handlePageFault(void *ptr, PageFaultDataTbx &faultData
}
void TbxPageFaultManager::removeAllocation(GraphicsAllocation *alloc) {
- std::unique_lock<SpinLock> lock{mtxTbx};
+ std::unique_lock<RecursiveSpinLock> lock{mtxTbx};
for (auto &data : memoryDataTbx) {
auto allocPtr = data.first;
auto faultData = data.second;
@@ -56,7 +56,7 @@ void TbxPageFaultManager::removeAllocation(GraphicsAllocation *alloc) {
}
void TbxPageFaultManager::insertAllocation(CommandStreamReceiver *csr, GraphicsAllocation *alloc, uint32_t bank, void *ptr, size_t size) {
- std::unique_lock<SpinLock> lock{mtxTbx};
+ std::unique_lock<RecursiveSpinLock> lock{mtxTbx};
if (this->memoryDataTbx.find(ptr) == this->memoryDataTbx.end()) {
PageFaultDataTbx pageFaultData{};

View File

@@ -38,7 +38,7 @@ class TbxPageFaultManager : public virtual CpuPageFaultManager {
void handlePageFault(void *ptr, PageFaultDataTbx &faultData);
std::unordered_map<void *, PageFaultDataTbx> memoryDataTbx;
- SpinLock mtxTbx;
+ RecursiveSpinLock mtxTbx;
};
} // namespace NEO

View File

@@ -15,9 +15,6 @@
namespace NEO {
- class TbxPageFaultManagerWindows final : public PageFaultManagerWindows, public TbxPageFaultManager {
- public:
- TbxPageFaultManagerWindows() : PageFaultManagerWindows(), TbxPageFaultManager() {}
- };
+ class TbxPageFaultManagerWindows final : public PageFaultManagerWindows, public TbxPageFaultManager {};
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -11,4 +11,5 @@
namespace NEO {
using SpinLock = std::mutex;
+ using RecursiveSpinLock = std::recursive_mutex;
} // namespace NEO
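
For context, the behavioural difference between the two aliases is plain standard C++: a `std::recursive_mutex` may be re-locked by the thread that already owns it, while re-locking a `std::mutex` the thread already holds is undefined behaviour and in practice never returns. A tiny standalone illustration (not NEO code):

```cpp
#include <mutex>

using SpinLock = std::mutex;
using RecursiveSpinLock = std::recursive_mutex;

int main() {
    RecursiveSpinLock recursive;
    std::unique_lock<RecursiveSpinLock> outer{recursive};
    std::unique_lock<RecursiveSpinLock> inner{recursive}; // OK: owning thread may re-lock

    SpinLock plain;
    std::unique_lock<SpinLock> first{plain};
    // std::unique_lock<SpinLock> second{plain}; // undefined behaviour: typically hangs
    return 0;
}
```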

View File

@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -129,12 +129,18 @@ TEST(AubHelper, givenAllocationTypeWhenAskingIfOneTimeWritableThenReturnCorrectR
}
}
- TEST(AubHelper, givenSetBufferHostMemoryAlwaysAubWritableWhenAskingIfBufferHostMemoryAllocationIsOneTimeAubWritableThenReturnCorrectResult) {
+ TEST(AubHelper, givenSetBufferHostMemoryAlwaysAubWritableAndDisabledTbxFaultMngrWhenAskingIfBufferHostMemoryAllocationIsOneTimeAubWritableThenReturnCorrectResult) {
DebugManagerStateRestore stateRestore;
NEO::debugManager.flags.EnableTbxPageFaultManager.set(0);
for (auto isAlwaysAubWritable : {false, true}) {
NEO::debugManager.flags.SetBufferHostMemoryAlwaysAubWritable.set(isAlwaysAubWritable);
EXPECT_NE(AubHelper::isOneTimeAubWritableAllocationType(AllocationType::bufferHostMemory), isAlwaysAubWritable);
for (auto isTbxFaultManagerEnabled : {false, true}) {
NEO::debugManager.flags.SetBufferHostMemoryAlwaysAubWritable.set(isAlwaysAubWritable);
NEO::debugManager.flags.EnableTbxPageFaultManager.set(isTbxFaultManagerEnabled);
bool isOneTimeAubWritable = AubHelper::isOneTimeAubWritableAllocationType(AllocationType::bufferHostMemory);
EXPECT_EQ(!isAlwaysAubWritable || isTbxFaultManagerEnabled, isOneTimeAubWritable);
}
}
}

View File

@@ -1589,8 +1589,12 @@ HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerI
for (const auto &allocType : onceWritableAllocTypesForTbx) {
gfxAlloc1->setAllocationType(allocType);
- if (GraphicsAllocation::isLockable(allocType)) {
+ if (allocType == AllocationType::gpuTimestampDeviceBuffer) {
+ EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
+ } else if (GraphicsAllocation::isLockable(allocType)) {
EXPECT_TRUE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
} else {
EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
}
}