fix: hotfix for svmcpu tbx uploads

Test program in the linked, related issue
is crashing in tbx mode. Tbx server indicated
upload of invalid memory was made before exit.

Running with debug messages showed that the
problematic upload was an svmcpu buffer when
running neo with separate cpu and gpu
buffers for shared memory management.

Using this info, the problem was narrowed down
to a missing unprotect call in page fault manager
related code, resulting in a protected(invalid)
memory region getting uploaded to tbx.

It is unclear yet why this unprotect call was not made,
since other svmcpu buffers were uploaded without issue.

This hotfix forces the unprotect call in the fault handler,
which allows the test program to run to completion. However,
there is now a failing test case.

Considering the critical nature of the associated
NEO issue and that this patch should unblock
the work depending on the fix, this hotfix should
get merged regardless of the failing test case.

In the meantime, I will continue triaging the
failing test and will implement a proper fix
once the root cause is isolated.

Related-To: NEO-13404
Signed-off-by: Jack Myers <jack.myers@intel.com>
This commit is contained in:
Jack Myers
2025-03-11 23:44:27 +00:00
committed by Compute-Runtime-Automation
parent 7b7d7e83a1
commit 5f78147e16
5 changed files with 67 additions and 11 deletions

View File

@@ -12,6 +12,7 @@
#include "shared/source/memory_manager/address_mapper.h"
#include "shared/source/memory_manager/page_table.h"
#include <array>
#include <set>
namespace NEO {

View File

@@ -89,7 +89,11 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::isAllocTbxFaultable(GraphicsAllocati
return false;
}
auto allocType = gfxAlloc->getAllocationType();
return AubHelper::isOneTimeAubWritableAllocationType(allocType) && GraphicsAllocation::isLockable(allocType) && allocType != AllocationType::gpuTimestampDeviceBuffer;
if (allocType == AllocationType::bufferHostMemory || allocType == AllocationType::timestampPacketTagBuffer) {
return true;
}
return false;
}
template <typename GfxFamily>
@@ -504,7 +508,7 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::writeMemory(GraphicsAllocation &gfxA
return false;
}
this->allowCPUMemoryAccessIfTbxFaultable(&gfxAllocation, cpuAddress, size);
this->protectCPUMemoryFromWritesIfTbxFaultable(&gfxAllocation, cpuAddress, size);
if (!isEngineInitialized) {
initializeEngine();

View File

@@ -9,6 +9,7 @@
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
namespace NEO {
@@ -17,6 +18,16 @@ bool TbxPageFaultManager::verifyAndHandlePageFault(void *ptr, bool handleFault)
std::unique_lock<RecursiveSpinLock> lock{mtxTbx};
auto allocPtr = getFaultData(memoryDataTbx, ptr, handleFault);
if (allocPtr == nullptr) {
if (handleFault) {
std::unique_lock<RecursiveSpinLock> lock{mtx};
auto allocPtr = getFaultData(memoryData, ptr, handleFault);
if (allocPtr != nullptr) {
auto faultData = memoryData[allocPtr];
if (faultData.domain == CpuPageFaultManager::AllocationDomain::gpu) {
this->allowCPUMemoryAccess(ptr, faultData.size);
}
}
}
return CpuPageFaultManager::verifyAndHandlePageFault(ptr, handleFault);
}
if (handleFault) {

View File

@@ -1570,7 +1570,7 @@ static constexpr std::array onceWritableAllocTypesForTbx{
AllocationType::bufferHostMemory,
};
HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerIsAvailableAndAllocIsLockableThenTbxFaultableTypesShouldReturnTrue) {
HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerIsAvailableAndAllocIsTbxFaultableThenTbxFaultableTypesShouldReturnTrue) {
DebugManagerStateRestore stateRestore;
debugManager.flags.SetCommandStreamReceiver.set(static_cast<int32_t>(CommandStreamReceiverType::tbx));
debugManager.flags.EnableTbxPageFaultManager.set(true);
@@ -1589,12 +1589,8 @@ HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerI
for (const auto &allocType : onceWritableAllocTypesForTbx) {
gfxAlloc1->setAllocationType(allocType);
if (allocType == AllocationType::gpuTimestampDeviceBuffer) {
EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
} else if (GraphicsAllocation::isLockable(allocType)) {
if (allocType == AllocationType::bufferHostMemory || allocType == AllocationType::timestampPacketTagBuffer) {
EXPECT_TRUE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
} else {
EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
}
}
@@ -1603,7 +1599,7 @@ HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerI
memoryManager->freeGraphicsMemory(gfxAlloc1);
}
HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerIsAvailableAndAllocIsNotLockableThenTbxFaultableTypesShouldReturnFalse) {
HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerIsAvailableAndAllocIsNotTbxFaultableThenTbxFaultableTypesShouldReturnFalse) {
DebugManagerStateRestore stateRestore;
debugManager.flags.SetCommandStreamReceiver.set(static_cast<int32_t>(CommandStreamReceiverType::tbx));
debugManager.flags.EnableTbxPageFaultManager.set(true);
@@ -1622,7 +1618,7 @@ HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerI
for (const auto &allocType : onceWritableAllocTypesForTbx) {
gfxAlloc1->setAllocationType(allocType);
if (!GraphicsAllocation::isLockable(allocType)) {
if (allocType != AllocationType::bufferHostMemory && allocType != AllocationType::timestampPacketTagBuffer) {
EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
}
}

View File

@@ -977,7 +977,7 @@ TEST_F(PageFaultManagerTest, givenNoUsmInitialPlacementFlagsWHenInsertingUsmAllo
EXPECT_EQ(allocs[3], unifiedMemoryManager->nonGpuDomainAllocs[3]);
}
TEST_F(PageFaultManagerTest, givenTbxModeWhenAllocateSharedMemoryThenTbxFaultManagerShouldBehaveLikeCpuFaultManager) {
TEST_F(PageFaultManagerTest, givenTbxModeWhenSharedMemoryNotInGpuDomainThenTbxFaultManagerShouldDoNothingBeforeCallingDefaultHandler) {
auto memoryManager2 = std::make_unique<MockMemoryManager>(executionEnvironment);
auto unifiedMemoryManager2 = std::make_unique<SVMAllocsManager>(memoryManager2.get(), false);
auto pageFaultManager2 = std::make_unique<MockPageFaultManagerImpl<TbxPageFaultManager>>();
@@ -993,6 +993,50 @@ TEST_F(PageFaultManagerTest, givenTbxModeWhenAllocateSharedMemoryThenTbxFaultMan
EXPECT_TRUE(pageFaultManager2->verifyAndHandlePageFault(ptr, true));
EXPECT_EQ(pageFaultManager2->protectWritesCalled, 0);
EXPECT_EQ(pageFaultManager2->allowMemoryAccessCalled, 1);
unifiedMemoryManager2->freeSVMAlloc(ptr);
}
TEST_F(PageFaultManagerTest, givenTbxModeWhenSharedMemoryInGpuDomainThenTbxFaultManagerShouldAllowCpuAccessBeforeCallingDefaultHandler) {
auto memoryManager2 = std::make_unique<MockMemoryManager>(executionEnvironment);
auto unifiedMemoryManager2 = std::make_unique<SVMAllocsManager>(memoryManager2.get(), false);
auto pageFaultManager2 = std::make_unique<MockPageFaultManagerImpl<TbxPageFaultManager>>();
void *cmdQ = reinterpret_cast<void *>(0xFFFF);
RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
auto properties = SVMAllocsManager::UnifiedMemoryProperties(InternalMemoryType::sharedUnifiedMemory, 1, rootDeviceIndices, deviceBitfields);
void *ptr = unifiedMemoryManager2->createSharedUnifiedMemoryAllocation(10, properties, cmdQ);
pageFaultManager2->insertAllocation(ptr, 10, unifiedMemoryManager2.get(), cmdQ, {});
pageFaultManager2->memoryData[ptr].domain = CpuPageFaultManager::AllocationDomain::gpu;
EXPECT_TRUE(pageFaultManager2->verifyAndHandlePageFault(ptr, true));
EXPECT_EQ(pageFaultManager2->protectWritesCalled, 0);
EXPECT_EQ(pageFaultManager2->allowMemoryAccessCalled, 2);
unifiedMemoryManager2->freeSVMAlloc(ptr);
}
TEST_F(PageFaultManagerTest, givenTbxModeWhenSharedMemoryIsManagedWhenHandleFaultIsFalseThenTbxFaultManagerShouldNotHandleFault) {
auto memoryManager2 = std::make_unique<MockMemoryManager>(executionEnvironment);
auto unifiedMemoryManager2 = std::make_unique<SVMAllocsManager>(memoryManager2.get(), false);
auto pageFaultManager2 = std::make_unique<MockPageFaultManagerImpl<TbxPageFaultManager>>();
void *cmdQ = reinterpret_cast<void *>(0xFFFF);
RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
auto properties = SVMAllocsManager::UnifiedMemoryProperties(InternalMemoryType::sharedUnifiedMemory, 1, rootDeviceIndices, deviceBitfields);
void *ptr = unifiedMemoryManager2->createSharedUnifiedMemoryAllocation(10, properties, cmdQ);
pageFaultManager2->insertAllocation(ptr, 10, unifiedMemoryManager2.get(), cmdQ, {});
EXPECT_TRUE(pageFaultManager2->verifyAndHandlePageFault(ptr, false));
EXPECT_EQ(pageFaultManager2->protectWritesCalled, 0);
EXPECT_EQ(pageFaultManager2->allowMemoryAccessCalled, 0);
unifiedMemoryManager2->freeSVMAlloc(ptr);
}