Allocate buffers in local memory for PVC multi root device platforms (1/n)

PVC platforms with no support for atomic operations on system memory
must always allocate buffers in local memory to avoid atomic access violations.
Note: the feature is being implemented under the new registry key
AllocateBuffersInLocalMemoryForMultiRootDeviceContexts (disabled by default)

Related-To: NEO-7092

Signed-off-by: Milczarek, Slawomir <slawomir.milczarek@intel.com>
Author: Milczarek, Slawomir
Date: 2022-10-24 23:25:04 +00:00
Committed by: Compute-Runtime-Automation
Parent: b0c97e49ea
Commit: 25a5ed0dca
15 changed files with 137 additions and 14 deletions
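
Orientation before the hunks: with the key enabled, buffers created on a context that spans more than one root device keep per-root-device local-memory storage and are migrated between devices on demand, instead of being forced into a single shared system-memory allocation. A minimal sketch of the intended decision (illustrative only; the variable names are not part of the commit, and the two helpers are the ones added further down in this diff):

    bool crossRootDeviceAccess = context->getRootDeviceIndices().size() > 1;
    // Key disabled (default): keep the single system-memory allocation shared across root devices.
    bool useSystemMemory = MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(crossRootDeviceAccess);
    // Key enabled: use multi-storage local-memory allocations and rely on MigrationController to move data.
    bool useMultiStorage = MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(crossRootDeviceAccess);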

View File

@@ -981,6 +981,9 @@ bool CommandQueue::queueDependenciesClearRequired() const {
bool CommandQueue::blitEnqueueAllowed(const CsrSelectionArgs &args) const {
bool blitEnqueueAllowed = getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() || this->isCopyOnly;
if (this->getContext().getRootDeviceIndices().size() > 1) {
blitEnqueueAllowed &= !DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.get();
}
if (DebugManager.flags.EnableBlitterForEnqueueOperations.get() != -1) {
blitEnqueueAllowed = DebugManager.flags.EnableBlitterForEnqueueOperations.get();
}
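
For a context with more than one root device, the two debug keys in this hunk compose as follows (a summary of the code above, not new behaviour; "base" stands for peekTimestampPacketWriteEnabled() || isCopyOnly):

    // EnableBlitterForEnqueueOperations == -1 (default): blitEnqueueAllowed = base && !AllocateBuffersInLocalMemoryForMultiRootDeviceContexts
    // EnableBlitterForEnqueueOperations == 0 or 1: the explicitly set value wins, since it is evaluated after the new multi-root-device check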

View File

@@ -440,6 +440,8 @@ Buffer *Buffer::create(Context *context,
}
}
multiGraphicsAllocation.setMultiStorage(MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(context->getRootDeviceIndices().size() > 1));
auto rootDeviceIndex = context->getDevice(0u)->getRootDeviceIndex();
auto &allocationInfo = allocationInfos[rootDeviceIndex];
auto memoryStorage = multiGraphicsAllocation.getDefaultGraphicsAllocation()->getUnderlyingBuffer();

View File

@@ -27,12 +27,17 @@ void MigrationController::handleMigration(Context &context, CommandStreamReceive
if (migrationSyncData->getCurrentLocation() != targetRootDeviceIndex) {
migrateMemory(context, *memoryManager, memObj, targetRootDeviceIndex);
}
- migrationSyncData->signalUsage(targetCsr.getTagAddress(), targetCsr.peekTaskCount() + 1);
+ if (!context.getSpecialQueue(targetRootDeviceIndex)->isWaitForTimestampsEnabled()) {
+ migrationSyncData->signalUsage(targetCsr.getTagAddress(), targetCsr.peekTaskCount() + 1);
+ }
}
void MigrationController::migrateMemory(Context &context, MemoryManager &memoryManager, MemObj *memObj, uint32_t targetRootDeviceIndex) {
auto &multiGraphicsAllocation = memObj->getMultiGraphicsAllocation();
auto migrationSyncData = multiGraphicsAllocation.getMigrationSyncData();
+ if (migrationSyncData->isMigrationInProgress()) {
+ return;
+ }
auto sourceRootDeviceIndex = migrationSyncData->getCurrentLocation();
if (sourceRootDeviceIndex == std::numeric_limits<uint32_t>::max()) {

View File

@@ -1689,6 +1689,20 @@ TEST(CommandQueue, givenImageToBufferClCommandWhenCallingBlitEnqueueAllowedThenR
EXPECT_FALSE(queue.blitEnqueueAllowed(args));
}
TEST(CommandQueue, givenAllocateBuffersInLocalMemoryForMultiRootDeviceContextsWhenMultiRootDeviceContextIsCreatedThenBlitEnqueueIsNotAllowed) {
DebugManagerStateRestore restorer;
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(1);
MockDefaultContext context{true};
MockCommandQueue queue(&context, context.getDevice(0), 0, false);
MockGraphicsAllocation alloc{};
ASSERT_TRUE(context.getRootDeviceIndices().size() > 1);
CsrSelectionArgs args{CL_COMMAND_READ_BUFFER, &alloc, &alloc, 0u, nullptr};
EXPECT_FALSE(queue.blitEnqueueAllowed(args));
}
template <bool blitter, bool selectBlitterWithQueueFamilies>
struct CsrSelectionCommandQueueTests : ::testing::Test {
void SetUp() override {

View File

@@ -219,7 +219,7 @@ HWTEST_F(EnqueueReadImageTest, givenGpuHangAndCommandQueueAndPtrCopyAllowedForHo
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageThenKernelRequiresMigration) {
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
@@ -270,7 +270,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageThen
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageIsCalledMultipleTimesThenEachKernelUsesDifferentImage) {
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
@@ -352,7 +352,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageIsCa
}
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueReadImageIsCalledThenCommandQueueIsFlushed) {
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
@@ -374,7 +374,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueRea
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueReadImageIsCalledThenTlbCacheIsInvalidated) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
@@ -409,7 +409,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueRea
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageIsCalledToDifferentDevicesThenCorrectLocationIsSet) {
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
auto pCmdQ2 = createCommandQueue(context.getDevice(1), nullptr, &context);
@@ -473,7 +473,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageIsCa
HWTEST_F(EnqueueReadImageTest, givenImageFromBufferThatRequiresMigrationWhenEnqueueReadImageThenBufferObjectIsTakenForMigration) {
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);

View File

@@ -590,7 +590,7 @@ HWTEST_F(EnqueueWriteImageTest, whenEnqueueWriteImageThenBuiltinKernelIsResolved
HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenEnqueueWriteImageThenKernelRequiresMigration) {
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
@@ -641,7 +641,7 @@ HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenEnqueueWriteImageTh
HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenEnqueueWriteImageIsCalledMultipleTimesThenEachKernelUsesDifferentImage) {
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
@@ -723,7 +723,7 @@ HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenEnqueueWriteImageIs
}
HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueWriteImageIsCalledThenCommandQueueIsFlushed) {
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
@@ -745,7 +745,7 @@ HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueWr
HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueWriteImageIsCalledThenTlbCacheIsInvalidated) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
- MockDefaultContext context;
+ MockDefaultContext context{true};
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);

View File

@@ -557,3 +557,43 @@ TEST_F(MemoryPropertiesHelperTests, givenSubDeviceIdWhenParsingExtraMemoryProper
EXPECT_EQ(0b10u, memoryProperties.pDevice->getDeviceBitfield().to_ulong());
EXPECT_EQ(&context.pSubDevice1->getDevice(), memoryProperties.pDevice);
}
TEST_F(MemoryPropertiesHelperTests, whenQueryingUseSystemMemoryForCrossRootDeviceAccessThenReturnTrueForMultiRootDeviceContexts) {
for (auto multiRootDevice : {false, true}) {
EXPECT_EQ(multiRootDevice, MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(multiRootDevice));
}
}
TEST_F(MemoryPropertiesHelperTests, givenAllocateBuffersInLocalMemoryForMultiRootDeviceContextsWhenQueryingUseSystemMemoryForCrossRootDeviceAccessThenReturnFalseForMultiRootDeviceContexts) {
DebugManagerStateRestore restore;
for (auto localMemory : {false, true}) {
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(localMemory);
EXPECT_FALSE(MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(false));
}
for (auto localMemory : {false, true}) {
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(localMemory);
EXPECT_NE(localMemory, MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(true));
}
}
TEST_F(MemoryPropertiesHelperTests, whenQueryingUseMultiStorageForCrossRootDeviceAccessThenReturnFalseForMultiRootDeviceContexts) {
for (auto multiRootDevice : {false, true}) {
EXPECT_FALSE(MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(multiRootDevice));
}
}
TEST_F(MemoryPropertiesHelperTests, givenAllocateBuffersInLocalMemoryForMultiRootDeviceContextsWhenQueryingUseMultiStorageForCrossRootDeviceAccessThenReturnTrueForMultiRootDeviceContexts) {
DebugManagerStateRestore restore;
for (auto localMemory : {false, true}) {
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(localMemory);
EXPECT_FALSE(MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(false));
}
for (auto localMemory : {false, true}) {
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(localMemory);
EXPECT_EQ(localMemory, MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(true));
}
}

View File

@@ -213,3 +213,42 @@ TEST_F(MigrationControllerTests, whenHandleMigrationThenProperTagAddressAndTaskC
EXPECT_EQ(pCsr0->getTagAddress(), migrationSyncData->tagAddress);
EXPECT_EQ(pCsr0->peekTaskCount() + 1, migrationSyncData->latestTaskCountUsed);
}
TEST_F(MigrationControllerTests, givenWaitForTimestampsEnabledWhenHandleMigrationIsCalledThenDontSignalTaskCountBasedUsage) {
DebugManagerStateRestore restorer;
DebugManager.flags.EnableTimestampWaitForQueues.set(4);
VariableBackup<decltype(MultiGraphicsAllocation::createMigrationSyncDataFunc)> createFuncBackup{&MultiGraphicsAllocation::createMigrationSyncDataFunc};
MultiGraphicsAllocation::createMigrationSyncDataFunc = [](size_t size) -> MigrationSyncData * {
return new MockMigrationSyncData(size);
};
std::unique_ptr<Buffer> pBuffer(BufferHelper<>::create(&context));
const_cast<MultiGraphicsAllocation &>(pBuffer->getMultiGraphicsAllocation()).setMultiStorage(true);
ASSERT_TRUE(pBuffer->getMultiGraphicsAllocation().requiresMigrations());
auto migrationSyncData = static_cast<MockMigrationSyncData *>(pBuffer->getMultiGraphicsAllocation().getMigrationSyncData());
MigrationController::handleMigration(context, *pCsr0, pBuffer.get());
EXPECT_EQ(0u, migrationSyncData->signalUsageCalled);
}
TEST_F(MigrationControllerTests, whenMemoryMigrationForMemoryObjectIsAlreadyInProgressThenDoEarlyReturn) {
DebugManagerStateRestore restorer;
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(1);
std::unique_ptr<Buffer> pBuffer(BufferHelper<>::create(&context));
ASSERT_TRUE(pBuffer->getMultiGraphicsAllocation().requiresMigrations());
auto migrationSyncData = static_cast<MockMigrationSyncData *>(pBuffer->getMultiGraphicsAllocation().getMigrationSyncData());
migrationSyncData->startMigration();
EXPECT_TRUE(migrationSyncData->isMigrationInProgress());
MigrationController::migrateMemory(context, *memoryManager, pBuffer.get(), pCsr1->getRootDeviceIndex());
EXPECT_TRUE(migrationSyncData->isMigrationInProgress());
}

View File

@@ -345,6 +345,7 @@ DECLARE_DEBUG_VARIABLE(bool, EnableAsyncEventsHandler, true, "Enables async even
DECLARE_DEBUG_VARIABLE(bool, EnableForcePin, true, "Enables early pinning for memory object")
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeND, true, "Enables different algorithm to compute local work size")
DECLARE_DEBUG_VARIABLE(bool, EnableMultiRootDeviceContexts, true, "Enables support for multi root device contexts")
DECLARE_DEBUG_VARIABLE(bool, AllocateBuffersInLocalMemoryForMultiRootDeviceContexts, false, "Enables support for buffers in local memory for multi root device contexts")
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, false, "Enables algorithm to compute the most squared work group as possible")
DECLARE_DEBUG_VARIABLE(bool, EnableExtendedVaFormats, false, "Enable more formats in cl-va sharing")
DECLARE_DEBUG_VARIABLE(bool, EnableFormatQuery, true, "Enable sharing format querying")

View File

@@ -39,5 +39,9 @@ class MemoryPropertiesHelper {
static GraphicsAllocation::UsmInitialPlacement getUSMInitialPlacement(const MemoryProperties &memoryProperties);
static void setUSMInitialPlacement(AllocationProperties &allocationProperties, GraphicsAllocation::UsmInitialPlacement initialPlacement);
static bool useSystemMemoryForCrossRootDeviceAccess(bool multiRootDevice);
static bool useMultiStorageForCrossRootDeviceAccess(bool multiRootDevice);
};
} // namespace NEO

View File

@@ -60,4 +60,12 @@ void MemoryPropertiesHelper::setUSMInitialPlacement(AllocationProperties &alloca
allocationProperties.usmInitialPlacement = initialPlacement;
}
bool MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(bool multiRootDevice) {
return multiRootDevice && !DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.get();
}
bool MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(bool multiRootDevice) {
return multiRootDevice && DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.get();
}
} // namespace NEO
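
A worked truth table for the two helpers under the new key (derived from the implementations above and matching the unit tests added earlier in this diff; shown only for reference):

    // multiRootDevice == false                 -> useSystemMemoryForCrossRootDeviceAccess == false, useMultiStorageForCrossRootDeviceAccess == false
    // multiRootDevice == true, key == 0 (def.) -> useSystemMemoryForCrossRootDeviceAccess == true,  useMultiStorageForCrossRootDeviceAccess == false
    // multiRootDevice == true, key == 1        -> useSystemMemoryForCrossRootDeviceAccess == false, useMultiStorageForCrossRootDeviceAccess == true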

View File

@@ -20,6 +20,7 @@
#include "shared/source/helpers/heap_assigner.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/memory_properties_helpers.h"
#include "shared/source/helpers/string.h"
#include "shared/source/helpers/surface_format_info.h"
#include "shared/source/memory_manager/compression_selector.h"
@@ -460,7 +461,7 @@ bool MemoryManager::getAllocationData(AllocationData &allocationData, const Allo
allocationData.useMmapObject = properties.useMmapObject;
allocationData.flags.crossRootDeviceAccess = properties.flags.crossRootDeviceAccess;
- allocationData.flags.useSystemMemory |= properties.flags.crossRootDeviceAccess;
+ allocationData.flags.useSystemMemory |= MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(properties.flags.crossRootDeviceAccess);
helper.setExtraAllocationData(allocationData, properties, hwInfo);
allocationData.flags.useSystemMemory |= properties.flags.forceSystemMemory;

View File

@@ -21,7 +21,7 @@ class MigrationSyncData : public ReferenceTrackedObject<MigrationSyncData> {
uint32_t getCurrentLocation() const;
void startMigration();
void setCurrentLocation(uint32_t rootDeviceIndex);
- void signalUsage(volatile uint32_t *tagAddress, uint32_t taskCount);
+ MOCKABLE_VIRTUAL void signalUsage(volatile uint32_t *tagAddress, uint32_t taskCount);
bool isUsedByTheSameContext(volatile uint32_t *tagAddress) const;
MOCKABLE_VIRTUAL void waitOnCpu();
bool isMigrationInProgress() const { return migrationInProgress; }

View File

@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -21,11 +21,16 @@ struct MockMigrationSyncData : public MigrationSyncData {
using MigrationSyncData::latestTaskCountUsed;
using MigrationSyncData::MigrationSyncData;
using MigrationSyncData::tagAddress;
void signalUsage(volatile uint32_t *tagAddress, uint32_t taskCount) override {
signalUsageCalled++;
MigrationSyncData::signalUsage(tagAddress, taskCount);
}
void waitOnCpu() override {
waitOnCpuCalled++;
MigrationSyncData::waitOnCpu();
}
uint32_t signalUsageCalled = 0u;
uint32_t waitOnCpuCalled = 0u;
};

View File

@@ -138,6 +138,7 @@ EnableGemCloseWorker = -1
EnableHostPtrValidation = -1
EnableComputeWorkSizeND = 1
EnableMultiRootDeviceContexts = 1
AllocateBuffersInLocalMemoryForMultiRootDeviceContexts = 0
EnableComputeWorkSizeSquared = 0
EnableVaLibCalls = -1
EnableExtendedVaFormats = 0
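
To experiment with the feature, the same entry can be flipped to 1; this is a sketch only, assuming the usual NEO debug-settings file mechanism (the file name and lookup rules are not part of this diff):

    AllocateBuffersInLocalMemoryForMultiRootDeviceContexts = 1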