mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-20 08:53:55 +08:00
fix: hotfix for svmcpu tbx uploads
Test program in the linked, related issue is crashing in tbx mode. Tbx server indicated upload of invalid memory was made before exit. Running with debug messages showed that the problematic upload was an svmcpu buffer when running neo with separate cpu and gpu buffers for shared memory management. Using this info, the problem was narrowed down to a missing unprotect call in page fault manager related code, resulting in a protected(invalid) memory region getting uploaded to tbx. It is unclear yet why this unprotect call was not made, since other svmcpu buffers were uploaded without issue. This hotfix forces the unprotect call in the fault handler, which allows the test program to run to completion. However, there is now a failing test case. Considering the critical nature of the associated NEO issue and that this patch should unblock the work depending on the fix, this hotfix should get merged regardless of the failing test case. In the meantime, I will continue triaging the failing test and will implement a proper fix once the root cause is isolated. Related-To: NEO-13404 Signed-off-by: Jack Myers <jack.myers@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
7b7d7e83a1
commit
5f78147e16
@@ -12,6 +12,7 @@
|
||||
#include "shared/source/memory_manager/address_mapper.h"
|
||||
#include "shared/source/memory_manager/page_table.h"
|
||||
|
||||
#include <array>
|
||||
#include <set>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
@@ -89,7 +89,11 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::isAllocTbxFaultable(GraphicsAllocati
|
||||
return false;
|
||||
}
|
||||
auto allocType = gfxAlloc->getAllocationType();
|
||||
return AubHelper::isOneTimeAubWritableAllocationType(allocType) && GraphicsAllocation::isLockable(allocType) && allocType != AllocationType::gpuTimestampDeviceBuffer;
|
||||
|
||||
if (allocType == AllocationType::bufferHostMemory || allocType == AllocationType::timestampPacketTagBuffer) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
@@ -504,7 +508,7 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::writeMemory(GraphicsAllocation &gfxA
|
||||
return false;
|
||||
}
|
||||
|
||||
this->allowCPUMemoryAccessIfTbxFaultable(&gfxAllocation, cpuAddress, size);
|
||||
this->protectCPUMemoryFromWritesIfTbxFaultable(&gfxAllocation, cpuAddress, size);
|
||||
|
||||
if (!isEngineInitialized) {
|
||||
initializeEngine();
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
|
||||
#include "shared/source/command_stream/command_stream_receiver.h"
|
||||
#include "shared/source/memory_manager/graphics_allocation.h"
|
||||
#include "shared/source/memory_manager/unified_memory_manager.h"
|
||||
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
|
||||
|
||||
namespace NEO {
|
||||
@@ -17,6 +18,16 @@ bool TbxPageFaultManager::verifyAndHandlePageFault(void *ptr, bool handleFault)
|
||||
std::unique_lock<RecursiveSpinLock> lock{mtxTbx};
|
||||
auto allocPtr = getFaultData(memoryDataTbx, ptr, handleFault);
|
||||
if (allocPtr == nullptr) {
|
||||
if (handleFault) {
|
||||
std::unique_lock<RecursiveSpinLock> lock{mtx};
|
||||
auto allocPtr = getFaultData(memoryData, ptr, handleFault);
|
||||
if (allocPtr != nullptr) {
|
||||
auto faultData = memoryData[allocPtr];
|
||||
if (faultData.domain == CpuPageFaultManager::AllocationDomain::gpu) {
|
||||
this->allowCPUMemoryAccess(ptr, faultData.size);
|
||||
}
|
||||
}
|
||||
}
|
||||
return CpuPageFaultManager::verifyAndHandlePageFault(ptr, handleFault);
|
||||
}
|
||||
if (handleFault) {
|
||||
|
||||
@@ -1570,7 +1570,7 @@ static constexpr std::array onceWritableAllocTypesForTbx{
|
||||
AllocationType::bufferHostMemory,
|
||||
};
|
||||
|
||||
HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerIsAvailableAndAllocIsLockableThenTbxFaultableTypesShouldReturnTrue) {
|
||||
HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerIsAvailableAndAllocIsTbxFaultableThenTbxFaultableTypesShouldReturnTrue) {
|
||||
DebugManagerStateRestore stateRestore;
|
||||
debugManager.flags.SetCommandStreamReceiver.set(static_cast<int32_t>(CommandStreamReceiverType::tbx));
|
||||
debugManager.flags.EnableTbxPageFaultManager.set(true);
|
||||
@@ -1589,12 +1589,8 @@ HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerI
|
||||
|
||||
for (const auto &allocType : onceWritableAllocTypesForTbx) {
|
||||
gfxAlloc1->setAllocationType(allocType);
|
||||
if (allocType == AllocationType::gpuTimestampDeviceBuffer) {
|
||||
EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
|
||||
} else if (GraphicsAllocation::isLockable(allocType)) {
|
||||
if (allocType == AllocationType::bufferHostMemory || allocType == AllocationType::timestampPacketTagBuffer) {
|
||||
EXPECT_TRUE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
|
||||
} else {
|
||||
EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1603,7 +1599,7 @@ HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerI
|
||||
memoryManager->freeGraphicsMemory(gfxAlloc1);
|
||||
}
|
||||
|
||||
HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerIsAvailableAndAllocIsNotLockableThenTbxFaultableTypesShouldReturnFalse) {
|
||||
HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerIsAvailableAndAllocIsNotTbxFaultableThenTbxFaultableTypesShouldReturnFalse) {
|
||||
DebugManagerStateRestore stateRestore;
|
||||
debugManager.flags.SetCommandStreamReceiver.set(static_cast<int32_t>(CommandStreamReceiverType::tbx));
|
||||
debugManager.flags.EnableTbxPageFaultManager.set(true);
|
||||
@@ -1622,7 +1618,7 @@ HWTEST_F(TbxCommandStreamTests, givenAubOneTimeWritableAllocWhenTbxFaultManagerI
|
||||
|
||||
for (const auto &allocType : onceWritableAllocTypesForTbx) {
|
||||
gfxAlloc1->setAllocationType(allocType);
|
||||
if (!GraphicsAllocation::isLockable(allocType)) {
|
||||
if (allocType != AllocationType::bufferHostMemory && allocType != AllocationType::timestampPacketTagBuffer) {
|
||||
EXPECT_FALSE(tbxCsr->isAllocTbxFaultable(gfxAlloc1));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -977,7 +977,7 @@ TEST_F(PageFaultManagerTest, givenNoUsmInitialPlacementFlagsWHenInsertingUsmAllo
|
||||
EXPECT_EQ(allocs[3], unifiedMemoryManager->nonGpuDomainAllocs[3]);
|
||||
}
|
||||
|
||||
TEST_F(PageFaultManagerTest, givenTbxModeWhenAllocateSharedMemoryThenTbxFaultManagerShouldBehaveLikeCpuFaultManager) {
|
||||
TEST_F(PageFaultManagerTest, givenTbxModeWhenSharedMemoryNotInGpuDomainThenTbxFaultManagerShouldDoNothingBeforeCallingDefaultHandler) {
|
||||
auto memoryManager2 = std::make_unique<MockMemoryManager>(executionEnvironment);
|
||||
auto unifiedMemoryManager2 = std::make_unique<SVMAllocsManager>(memoryManager2.get(), false);
|
||||
auto pageFaultManager2 = std::make_unique<MockPageFaultManagerImpl<TbxPageFaultManager>>();
|
||||
@@ -993,6 +993,50 @@ TEST_F(PageFaultManagerTest, givenTbxModeWhenAllocateSharedMemoryThenTbxFaultMan
|
||||
|
||||
EXPECT_TRUE(pageFaultManager2->verifyAndHandlePageFault(ptr, true));
|
||||
EXPECT_EQ(pageFaultManager2->protectWritesCalled, 0);
|
||||
EXPECT_EQ(pageFaultManager2->allowMemoryAccessCalled, 1);
|
||||
|
||||
unifiedMemoryManager2->freeSVMAlloc(ptr);
|
||||
}
|
||||
|
||||
TEST_F(PageFaultManagerTest, givenTbxModeWhenSharedMemoryInGpuDomainThenTbxFaultManagerShouldAllowCpuAccessBeforeCallingDefaultHandler) {
|
||||
auto memoryManager2 = std::make_unique<MockMemoryManager>(executionEnvironment);
|
||||
auto unifiedMemoryManager2 = std::make_unique<SVMAllocsManager>(memoryManager2.get(), false);
|
||||
auto pageFaultManager2 = std::make_unique<MockPageFaultManagerImpl<TbxPageFaultManager>>();
|
||||
void *cmdQ = reinterpret_cast<void *>(0xFFFF);
|
||||
|
||||
RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
|
||||
std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
|
||||
|
||||
auto properties = SVMAllocsManager::UnifiedMemoryProperties(InternalMemoryType::sharedUnifiedMemory, 1, rootDeviceIndices, deviceBitfields);
|
||||
void *ptr = unifiedMemoryManager2->createSharedUnifiedMemoryAllocation(10, properties, cmdQ);
|
||||
|
||||
pageFaultManager2->insertAllocation(ptr, 10, unifiedMemoryManager2.get(), cmdQ, {});
|
||||
pageFaultManager2->memoryData[ptr].domain = CpuPageFaultManager::AllocationDomain::gpu;
|
||||
|
||||
EXPECT_TRUE(pageFaultManager2->verifyAndHandlePageFault(ptr, true));
|
||||
EXPECT_EQ(pageFaultManager2->protectWritesCalled, 0);
|
||||
EXPECT_EQ(pageFaultManager2->allowMemoryAccessCalled, 2);
|
||||
|
||||
unifiedMemoryManager2->freeSVMAlloc(ptr);
|
||||
}
|
||||
|
||||
TEST_F(PageFaultManagerTest, givenTbxModeWhenSharedMemoryIsManagedWhenHandleFaultIsFalseThenTbxFaultManagerShouldNotHandleFault) {
|
||||
auto memoryManager2 = std::make_unique<MockMemoryManager>(executionEnvironment);
|
||||
auto unifiedMemoryManager2 = std::make_unique<SVMAllocsManager>(memoryManager2.get(), false);
|
||||
auto pageFaultManager2 = std::make_unique<MockPageFaultManagerImpl<TbxPageFaultManager>>();
|
||||
void *cmdQ = reinterpret_cast<void *>(0xFFFF);
|
||||
|
||||
RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
|
||||
std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
|
||||
|
||||
auto properties = SVMAllocsManager::UnifiedMemoryProperties(InternalMemoryType::sharedUnifiedMemory, 1, rootDeviceIndices, deviceBitfields);
|
||||
void *ptr = unifiedMemoryManager2->createSharedUnifiedMemoryAllocation(10, properties, cmdQ);
|
||||
|
||||
pageFaultManager2->insertAllocation(ptr, 10, unifiedMemoryManager2.get(), cmdQ, {});
|
||||
|
||||
EXPECT_TRUE(pageFaultManager2->verifyAndHandlePageFault(ptr, false));
|
||||
EXPECT_EQ(pageFaultManager2->protectWritesCalled, 0);
|
||||
EXPECT_EQ(pageFaultManager2->allowMemoryAccessCalled, 0);
|
||||
|
||||
unifiedMemoryManager2->freeSVMAlloc(ptr);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user