fix: tbx page fault manager hang issue
- Updated `isAllocTbxFaultable` to exclude `gpuTimestampDeviceBuffer` from being faultable.
- Replaced `SpinLock` with `RecursiveSpinLock` in `CpuPageFaultManager` and `TbxPageFaultManager` to allow recursive locking.
- Added unit tests to verify the correct handling of `gpuTimestampDeviceBuffer` in `TbxCommandStreamTests`.

Related-To: NEO-13748
Signed-off-by: Jack Myers <jack.myers@intel.com>
committed by Compute-Runtime-Automation
parent b7ba71df1c
commit 7d4e70a25b
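The lock swap in the commit message points at a self-deadlock: `std::mutex` must not be locked again by the thread that already owns it (undefined behavior, in practice a permanent hang), while `std::recursive_mutex` tracks an ownership count and allows nested acquisition. A minimal standalone demo of the difference, assuming the hang came from such same-thread re-entry (simplified illustration, not NEO code):

```cpp
#include <iostream>
#include <mutex>

std::recursive_mutex mtx;

void removeEntry() {
    // Called while handleFault() already holds mtx on this thread. With a
    // plain std::mutex this second lock() would be undefined behavior and in
    // practice hangs forever; std::recursive_mutex just increments the
    // owning thread's lock count.
    std::unique_lock<std::recursive_mutex> lock{mtx};
    std::cout << "entry removed under nested lock\n";
}

void handleFault() {
    std::unique_lock<std::recursive_mutex> lock{mtx}; // first acquisition
    removeEntry();                                    // nested acquisition succeeds
}

int main() {
    handleFault();
}
```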
```diff
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,9 +41,8 @@ bool AubHelper::isOneTimeAubWritableAllocationType(const AllocationType &type) {
     case AllocationType::assertBuffer:
     case AllocationType::tagBuffer:
     case AllocationType::syncDispatchToken:
         return true;
     case AllocationType::bufferHostMemory:
-        return NEO::debugManager.flags.SetBufferHostMemoryAlwaysAubWritable.get() ? false : true;
+        return (NEO::debugManager.flags.SetBufferHostMemoryAlwaysAubWritable.get() ? false : true) || (NEO::debugManager.flags.EnableTbxPageFaultManager.get() == 1);
     default:
         return false;
     }

@@ -89,7 +89,7 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::isAllocTbxFaultable(GraphicsAllocati
         return false;
     }
     auto allocType = gfxAlloc->getAllocationType();
-    return AubHelper::isOneTimeAubWritableAllocationType(allocType) && GraphicsAllocation::isLockable(allocType);
+    return AubHelper::isOneTimeAubWritableAllocationType(allocType) && GraphicsAllocation::isLockable(allocType) && allocType != AllocationType::gpuTimestampDeviceBuffer;
 }

 template <typename GfxFamily>
```
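The added conjunct is the functional fix named in the commit title: `gpuTimestampDeviceBuffer` allocations are never handed to the TBX page fault manager, since letting them fault was implicated in the hang (NEO-13748). A toy restatement of the predicate, with hypothetical stand-ins for NEO's `AubHelper` and `GraphicsAllocation` helpers so it compiles on its own; only the shape of the check is taken from the diff:

```cpp
#include <cstdio>

// Stand-ins for NEO types and helpers -- hypothetical simplifications.
enum class AllocationType { bufferHostMemory, gpuTimestampDeviceBuffer, tagBuffer };

bool isOneTimeAubWritable(AllocationType) { return true; }                    // AubHelper stand-in
bool isLockable(AllocationType t) { return t != AllocationType::tagBuffer; } // arbitrary stand-in

bool isAllocTbxFaultable(AllocationType t) {
    return isOneTimeAubWritable(t)   // only once-writable types are tracked
        && isLockable(t)             // a CPU mapping is needed to protect/unprotect
        && t != AllocationType::gpuTimestampDeviceBuffer; // the new exclusion
}

int main() {
    // The timestamp buffer now always reports non-faultable:
    std::printf("%d\n", isAllocTbxFaultable(AllocationType::gpuTimestampDeviceBuffer)); // prints 0
}
```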
```diff
@@ -41,7 +41,7 @@ DECLARE_DEBUG_VARIABLE(bool, AUBDumpAllocsOnEnqueueSVMMemcpyOnly, false, "Force
 DECLARE_DEBUG_VARIABLE(bool, AUBDumpForceAllToLocalMemory, false, "Force placing every allocation in local memory address space")
 DECLARE_DEBUG_VARIABLE(bool, GenerateAubFilePerProcessId, true, "Generate aub file with process id")
 DECLARE_DEBUG_VARIABLE(bool, SetBufferHostMemoryAlwaysAubWritable, false, "Make buffer host memory allocation always uploaded to AUB/TBX")
-DECLARE_DEBUG_VARIABLE(bool, EnableTbxPageFaultManager, false, "Enables experiemental page fault manager for host buffer types, improves upon SetBufferHostMemoryAlwaysAubWritable")
+DECLARE_DEBUG_VARIABLE(bool, EnableTbxPageFaultManager, false, "Enables experimental page fault manager for host buffers and some other alloc types, replaces SetBufferHostMemoryAlwaysAubWritable")

 /*DEBUG FLAGS*/
 DECLARE_DEBUG_VARIABLE(bool, EnableSWTags, false, "Enable software tagging in batch buffer")
```
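Both flags involved are ordinary NEO debug variables, so tests toggle them through `debugManager` under a `DebugManagerStateRestore`, exactly as the updated unit test further down does:

```cpp
DebugManagerStateRestore stateRestore; // reverts every flag change when it goes out of scope
NEO::debugManager.flags.EnableTbxPageFaultManager.set(1);
NEO::debugManager.flags.SetBufferHostMemoryAlwaysAubWritable.set(0);
```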
```diff
@@ -21,7 +21,7 @@ void CpuPageFaultManager::insertAllocation(void *ptr, size_t size, SVMAllocsMana
     auto initialPlacement = MemoryPropertiesHelper::getUSMInitialPlacement(memoryProperties);
     const auto domain = (initialPlacement == GraphicsAllocation::UsmInitialPlacement::CPU) ? AllocationDomain::cpu : AllocationDomain::none;

-    std::unique_lock<SpinLock> lock{mtx};
+    std::unique_lock<RecursiveSpinLock> lock{mtx};
     PageFaultData faultData{};
     faultData.size = size;
     faultData.unifiedMemoryManager = unifiedMemoryManager;

@@ -35,7 +35,7 @@ void CpuPageFaultManager::insertAllocation(void *ptr, size_t size, SVMAllocsMana
 }

 void CpuPageFaultManager::removeAllocation(void *ptr) {
-    std::unique_lock<SpinLock> lock{mtx};
+    std::unique_lock<RecursiveSpinLock> lock{mtx};
     auto alloc = memoryData.find(ptr);
     if (alloc != memoryData.end()) {
         auto &pageFaultData = alloc->second;

@@ -52,7 +52,7 @@ void CpuPageFaultManager::removeAllocation(void *ptr) {
 }

 void CpuPageFaultManager::moveAllocationToGpuDomain(void *ptr) {
-    std::unique_lock<SpinLock> lock{mtx};
+    std::unique_lock<RecursiveSpinLock> lock{mtx};
     auto alloc = memoryData.find(ptr);
     if (alloc != memoryData.end()) {
         auto &pageFaultData = alloc->second;

@@ -68,7 +68,7 @@ void CpuPageFaultManager::moveAllocationToGpuDomain(void *ptr) {
 }

 void CpuPageFaultManager::moveAllocationsWithinUMAllocsManagerToGpuDomain(SVMAllocsManager *unifiedMemoryManager) {
-    std::unique_lock<SpinLock> lock{mtx};
+    std::unique_lock<RecursiveSpinLock> lock{mtx};
     for (auto allocPtr : unifiedMemoryManager->nonGpuDomainAllocs) {
         auto &pageFaultData = this->memoryData[allocPtr];
         this->migrateStorageToGpuDomain(allocPtr, pageFaultData);

@@ -108,7 +108,7 @@ void CpuPageFaultManager::handlePageFault(void *ptr, PageFaultData &faultData) {
 }

 bool CpuPageFaultManager::verifyAndHandlePageFault(void *ptr, bool handleFault) {
-    std::unique_lock<SpinLock> lock{mtx};
+    std::unique_lock<RecursiveSpinLock> lock{mtx};
     auto allocPtr = getFaultData(memoryData, ptr, handleFault);
     if (allocPtr == nullptr) {
         return false;

@@ -98,6 +98,6 @@ class CpuPageFaultManager : public NonCopyableClass {
     gpuDomainHandlerType gpuDomainHandler = &transferAndUnprotectMemory;

     std::unordered_map<void *, PageFaultData> memoryData;
-    SpinLock mtx;
+    RecursiveSpinLock mtx;
 };
 } // namespace NEO
```
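Every public method of the fault manager takes the same lock, so any path where fault handling calls back into the public API on the same thread requires recursion. A schematic of that shape (a hypothetical simplification; the actual NEO call chain is not shown in this diff):

```cpp
#include <mutex>
#include <unordered_map>

class FaultManagerSketch {
  public:
    bool verifyAndHandlePageFault(void *ptr) {
        std::unique_lock<std::recursive_mutex> lock{mtx}; // first acquisition
        return handleFault(ptr);
    }
    void removeAllocation(void *ptr) {
        std::unique_lock<std::recursive_mutex> lock{mtx}; // nested acquisition: fine now
        tracked.erase(ptr);
    }

  private:
    bool handleFault(void *ptr) {
        // Still holding mtx; with the old non-recursive lock this re-entry
        // deadlocked the calling thread.
        removeAllocation(ptr);
        return true;
    }
    std::unordered_map<void *, int> tracked;
    std::recursive_mutex mtx;
};
```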
```diff
@@ -12,9 +12,6 @@

 namespace NEO {

-class TbxPageFaultManagerLinux final : public PageFaultManagerLinux, public TbxPageFaultManager {
-  public:
-    TbxPageFaultManagerLinux() : PageFaultManagerLinux(), TbxPageFaultManager() {}
-};
+class TbxPageFaultManagerLinux final : public PageFaultManagerLinux, public TbxPageFaultManager {};

 } // namespace NEO

@@ -14,7 +14,7 @@
 namespace NEO {

 bool TbxPageFaultManager::verifyAndHandlePageFault(void *ptr, bool handleFault) {
-    std::unique_lock<SpinLock> lock{mtxTbx};
+    std::unique_lock<RecursiveSpinLock> lock{mtxTbx};
     auto allocPtr = getFaultData(memoryDataTbx, ptr, handleFault);
     if (allocPtr == nullptr) {
         return CpuPageFaultManager::verifyAndHandlePageFault(ptr, handleFault);

@@ -43,7 +43,7 @@ void TbxPageFaultManager::handlePageFault(void *ptr, PageFaultDataTbx &faultData
 }

 void TbxPageFaultManager::removeAllocation(GraphicsAllocation *alloc) {
-    std::unique_lock<SpinLock> lock{mtxTbx};
+    std::unique_lock<RecursiveSpinLock> lock{mtxTbx};
     for (auto &data : memoryDataTbx) {
         auto allocPtr = data.first;
         auto faultData = data.second;

@@ -56,7 +56,7 @@ void TbxPageFaultManager::removeAllocation(GraphicsAllocation *alloc) {
 }

 void TbxPageFaultManager::insertAllocation(CommandStreamReceiver *csr, GraphicsAllocation *alloc, uint32_t bank, void *ptr, size_t size) {
-    std::unique_lock<SpinLock> lock{mtxTbx};
+    std::unique_lock<RecursiveSpinLock> lock{mtxTbx};

     if (this->memoryDataTbx.find(ptr) == this->memoryDataTbx.end()) {
         PageFaultDataTbx pageFaultData{};

@@ -38,7 +38,7 @@ class TbxPageFaultManager : public virtual CpuPageFaultManager {
     void handlePageFault(void *ptr, PageFaultDataTbx &faultData);

     std::unordered_map<void *, PageFaultDataTbx> memoryDataTbx;
-    SpinLock mtxTbx;
+    RecursiveSpinLock mtxTbx;
 };

 } // namespace NEO

@@ -15,9 +15,6 @@

 namespace NEO {

-class TbxPageFaultManagerWindows final : public PageFaultManagerWindows, public TbxPageFaultManager {
-  public:
-    TbxPageFaultManagerWindows() : PageFaultManagerWindows(), TbxPageFaultManager() {}
-};
+class TbxPageFaultManagerWindows final : public PageFaultManagerWindows, public TbxPageFaultManager {};

 } // namespace NEO

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

@@ -11,4 +11,5 @@

 namespace NEO {
 using SpinLock = std::mutex;
+using RecursiveSpinLock = std::recursive_mutex;
 } // namespace NEO
```
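Despite the names, neither alias is a spinlock: `SpinLock` already aliased `std::mutex`, and the new `RecursiveSpinLock` follows suit with `std::recursive_mutex`. Both satisfy the standard Lockable requirements, so the call sites only needed the template argument of their guards changed:

```cpp
NEO::RecursiveSpinLock mtx;                          // really a std::recursive_mutex
std::unique_lock<NEO::RecursiveSpinLock> lock{mtx};  // same RAII pattern as before
```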
```diff
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

@@ -129,12 +129,18 @@ TEST(AubHelper, givenAllocationTypeWhenAskingIfOneTimeWritableThenReturnCorrectR
     }
 }

-TEST(AubHelper, givenSetBufferHostMemoryAlwaysAubWritableWhenAskingIfBufferHostMemoryAllocationIsOneTimeAubWritableThenReturnCorrectResult) {
+TEST(AubHelper, givenSetBufferHostMemoryAlwaysAubWritableAndDisabledTbxFaultMngrWhenAskingIfBufferHostMemoryAllocationIsOneTimeAubWritableThenReturnCorrectResult) {
     DebugManagerStateRestore stateRestore;
+    NEO::debugManager.flags.EnableTbxPageFaultManager.set(0);

     for (auto isAlwaysAubWritable : {false, true}) {
-        NEO::debugManager.flags.SetBufferHostMemoryAlwaysAubWritable.set(isAlwaysAubWritable);
-        EXPECT_NE(AubHelper::isOneTimeAubWritableAllocationType(AllocationType::bufferHostMemory), isAlwaysAubWritable);
+        for (auto isTbxFaultManagerEnabled : {false, true}) {
+            NEO::debugManager.flags.SetBufferHostMemoryAlwaysAubWritable.set(isAlwaysAubWritable);
+            NEO::debugManager.flags.EnableTbxPageFaultManager.set(isTbxFaultManagerEnabled);
+
+            bool isOneTimeAubWritable = AubHelper::isOneTimeAubWritableAllocationType(AllocationType::bufferHostMemory);
+            EXPECT_EQ(!isAlwaysAubWritable || isTbxFaultManagerEnabled, isOneTimeAubWritable);
+        }
     }
 }
```
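The assertion `EXPECT_EQ(!isAlwaysAubWritable || isTbxFaultManagerEnabled, isOneTimeAubWritable)` matches the production change above and encodes this truth table for `bufferHostMemory`:

| SetBufferHostMemoryAlwaysAubWritable | EnableTbxPageFaultManager | one-time AUB/TBX writable |
| --- | --- | --- |
| false | false | true |
| false | true | true |
| true | false | false |
| true | true | true |

In other words, the fault manager forces one-time writability even when the legacy flag would otherwise have kept host buffers always writable.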
```diff
@@ -1589,8 +1589,12 @@ HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerI

     for (const auto &allocType : onceWritableAllocTypesForTbx) {
         gfxAlloc1->setAllocationType(allocType);
-        if (GraphicsAllocation::isLockable(allocType)) {
+        if (allocType == AllocationType::gpuTimestampDeviceBuffer) {
+            EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
+        } else if (GraphicsAllocation::isLockable(allocType)) {
             EXPECT_TRUE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
         } else {
             EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
         }
     }
```