mirror of
https://github.com/intel/compute-runtime.git
synced 2025-09-10 12:53:42 +08:00
Allocate buffers in local memory for PVC multi root device platforms (1/n)
The PVC platform does not support atomic operations on system memory, so buffers must always be allocated in local memory to avoid atomic access violations. Note: the feature is being implemented under the new registry key AllocateBuffersInLocalMemoryForMultiRootDeviceContexts (disabled by default). Related-To: NEO-7092 Signed-off-by: Milczarek, Slawomir <slawomir.milczarek@intel.com>
This commit is contained in:

committed by
Compute-Runtime-Automation

parent
b0c97e49ea
commit
25a5ed0dca
@ -981,6 +981,9 @@ bool CommandQueue::queueDependenciesClearRequired() const {
|
||||
|
||||
bool CommandQueue::blitEnqueueAllowed(const CsrSelectionArgs &args) const {
|
||||
bool blitEnqueueAllowed = getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() || this->isCopyOnly;
|
||||
if (this->getContext().getRootDeviceIndices().size() > 1) {
|
||||
blitEnqueueAllowed &= !DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.get();
|
||||
}
|
||||
if (DebugManager.flags.EnableBlitterForEnqueueOperations.get() != -1) {
|
||||
blitEnqueueAllowed = DebugManager.flags.EnableBlitterForEnqueueOperations.get();
|
||||
}
|
||||
|
@ -440,6 +440,8 @@ Buffer *Buffer::create(Context *context,
|
||||
}
|
||||
}
|
||||
|
||||
multiGraphicsAllocation.setMultiStorage(MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(context->getRootDeviceIndices().size() > 1));
|
||||
|
||||
auto rootDeviceIndex = context->getDevice(0u)->getRootDeviceIndex();
|
||||
auto &allocationInfo = allocationInfos[rootDeviceIndex];
|
||||
auto memoryStorage = multiGraphicsAllocation.getDefaultGraphicsAllocation()->getUnderlyingBuffer();
|
||||
|
@ -27,12 +27,17 @@ void MigrationController::handleMigration(Context &context, CommandStreamReceive
|
||||
if (migrationSyncData->getCurrentLocation() != targetRootDeviceIndex) {
|
||||
migrateMemory(context, *memoryManager, memObj, targetRootDeviceIndex);
|
||||
}
|
||||
migrationSyncData->signalUsage(targetCsr.getTagAddress(), targetCsr.peekTaskCount() + 1);
|
||||
if (!context.getSpecialQueue(targetRootDeviceIndex)->isWaitForTimestampsEnabled()) {
|
||||
migrationSyncData->signalUsage(targetCsr.getTagAddress(), targetCsr.peekTaskCount() + 1);
|
||||
}
|
||||
}
|
||||
|
||||
void MigrationController::migrateMemory(Context &context, MemoryManager &memoryManager, MemObj *memObj, uint32_t targetRootDeviceIndex) {
|
||||
auto &multiGraphicsAllocation = memObj->getMultiGraphicsAllocation();
|
||||
auto migrationSyncData = multiGraphicsAllocation.getMigrationSyncData();
|
||||
if (migrationSyncData->isMigrationInProgress()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto sourceRootDeviceIndex = migrationSyncData->getCurrentLocation();
|
||||
if (sourceRootDeviceIndex == std::numeric_limits<uint32_t>::max()) {
|
||||
|
@ -1689,6 +1689,20 @@ TEST(CommandQueue, givenImageToBufferClCommandWhenCallingBlitEnqueueAllowedThenR
|
||||
EXPECT_FALSE(queue.blitEnqueueAllowed(args));
|
||||
}
|
||||
|
||||
TEST(CommandQueue, givenAllocateBuffersInLocalMemoryForMultiRootDeviceContextsWhenMultiRootDeviceContextIsCreatedThenWhenBlitEnqueueIsNotAllowed) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(1);
|
||||
|
||||
MockDefaultContext context{true};
|
||||
MockCommandQueue queue(&context, context.getDevice(0), 0, false);
|
||||
MockGraphicsAllocation alloc{};
|
||||
|
||||
ASSERT_TRUE(context.getRootDeviceIndices().size() > 1);
|
||||
|
||||
CsrSelectionArgs args{CL_COMMAND_READ_BUFFER, &alloc, &alloc, 0u, nullptr};
|
||||
EXPECT_FALSE(queue.blitEnqueueAllowed(args));
|
||||
}
|
||||
|
||||
template <bool blitter, bool selectBlitterWithQueueFamilies>
|
||||
struct CsrSelectionCommandQueueTests : ::testing::Test {
|
||||
void SetUp() override {
|
||||
|
@ -219,7 +219,7 @@ HWTEST_F(EnqueueReadImageTest, givenGpuHangAndCommandQueueAndPtrCopyAllowedForHo
|
||||
|
||||
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageThenKernelRequiresMigration) {
|
||||
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
|
||||
@ -270,7 +270,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageThen
|
||||
|
||||
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageIsCalledMultipleTimesThenEachKernelUsesDifferentImage) {
|
||||
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
|
||||
@ -352,7 +352,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageIsCa
|
||||
}
|
||||
|
||||
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueReadImageIsCalledThenCommandQueueIsFlushed) {
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
|
||||
@ -374,7 +374,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueRea
|
||||
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueReadImageIsCalledThenTlbCacheIsInvalidated) {
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
|
||||
@ -409,7 +409,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueRea
|
||||
|
||||
HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageIsCalledToDifferentDevicesThenCorrectLocationIsSet) {
|
||||
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
auto pCmdQ2 = createCommandQueue(context.getDevice(1), nullptr, &context);
|
||||
@ -473,7 +473,7 @@ HWTEST_F(EnqueueReadImageTest, givenMultiRootDeviceImageWhenEnqueueReadImageIsCa
|
||||
|
||||
HWTEST_F(EnqueueReadImageTest, givenImageFromBufferThatRequiresMigrationWhenEnqueueReadImageThenBufferObjectIsTakenForMigration) {
|
||||
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
|
||||
|
@ -590,7 +590,7 @@ HWTEST_F(EnqueueWriteImageTest, whenEnqueueWriteImageThenBuiltinKernelIsResolved
|
||||
|
||||
HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenEnqueueWriteImageThenKernelRequiresMigration) {
|
||||
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
|
||||
@ -641,7 +641,7 @@ HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenEnqueueWriteImageTh
|
||||
|
||||
HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenEnqueueWriteImageIsCalledMultipleTimesThenEachKernelUsesDifferentImage) {
|
||||
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
|
||||
@ -723,7 +723,7 @@ HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenEnqueueWriteImageIs
|
||||
}
|
||||
|
||||
HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueWriteImageIsCalledThenCommandQueueIsFlushed) {
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
|
||||
@ -745,7 +745,7 @@ HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueWr
|
||||
HWTEST_F(EnqueueWriteImageTest, givenMultiRootDeviceImageWhenNonBlockedEnqueueWriteImageIsCalledThenTlbCacheIsInvalidated) {
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
|
||||
MockDefaultContext context;
|
||||
MockDefaultContext context{true};
|
||||
|
||||
auto pCmdQ1 = createCommandQueue(context.getDevice(0), nullptr, &context);
|
||||
|
||||
|
@ -557,3 +557,43 @@ TEST_F(MemoryPropertiesHelperTests, givenSubDeviceIdWhenParsingExtraMemoryProper
|
||||
EXPECT_EQ(0b10u, memoryProperties.pDevice->getDeviceBitfield().to_ulong());
|
||||
EXPECT_EQ(&context.pSubDevice1->getDevice(), memoryProperties.pDevice);
|
||||
}
|
||||
|
||||
TEST_F(MemoryPropertiesHelperTests, whenQueryingUseSystemMemoryForCrossRootDeviceAccessThenReturnTrueForMultiRootDeviceContexts) {
|
||||
for (auto multiRootDevice : {false, true}) {
|
||||
EXPECT_EQ(multiRootDevice, MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(multiRootDevice));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(MemoryPropertiesHelperTests, givenAllocateBuffersInLocalMemoryForMultiRootDeviceContextsWhenQueryingUseSystemMemoryForCrossRootDeviceAccessThenReturnFalseForMultiRootDeviceContexts) {
|
||||
DebugManagerStateRestore restore;
|
||||
|
||||
for (auto localMemory : {false, true}) {
|
||||
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(localMemory);
|
||||
EXPECT_FALSE(MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(false));
|
||||
}
|
||||
|
||||
for (auto localMemory : {false, true}) {
|
||||
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(localMemory);
|
||||
EXPECT_NE(localMemory, MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(true));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(MemoryPropertiesHelperTests, whenQueryingUseMultiStorageForCrossRootDeviceAccessThenReturnFalseForMultiRootDeviceContexts) {
|
||||
for (auto multiRootDevice : {false, true}) {
|
||||
EXPECT_FALSE(MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(multiRootDevice));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(MemoryPropertiesHelperTests, givenAllocateBuffersInLocalMemoryForMultiRootDeviceContextsWhenQueryingUseMultiStorageForCrossRootDeviceAccessThenReturnTrueForMultiRootDeviceContexts) {
|
||||
DebugManagerStateRestore restore;
|
||||
|
||||
for (auto localMemory : {false, true}) {
|
||||
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(localMemory);
|
||||
EXPECT_FALSE(MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(false));
|
||||
}
|
||||
|
||||
for (auto localMemory : {false, true}) {
|
||||
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(localMemory);
|
||||
EXPECT_EQ(localMemory, MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(true));
|
||||
}
|
||||
}
|
||||
|
@ -213,3 +213,42 @@ TEST_F(MigrationControllerTests, whenHandleMigrationThenProperTagAddressAndTaskC
|
||||
EXPECT_EQ(pCsr0->getTagAddress(), migrationSyncData->tagAddress);
|
||||
EXPECT_EQ(pCsr0->peekTaskCount() + 1, migrationSyncData->latestTaskCountUsed);
|
||||
}
|
||||
|
||||
TEST_F(MigrationControllerTests, givenWaitForTimestampsEnabledWhenHandleMigrationIsCalledThenDontSignalTaskCountBasedUsage) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.EnableTimestampWaitForQueues.set(4);
|
||||
|
||||
VariableBackup<decltype(MultiGraphicsAllocation::createMigrationSyncDataFunc)> createFuncBackup{&MultiGraphicsAllocation::createMigrationSyncDataFunc};
|
||||
MultiGraphicsAllocation::createMigrationSyncDataFunc = [](size_t size) -> MigrationSyncData * {
|
||||
return new MockMigrationSyncData(size);
|
||||
};
|
||||
|
||||
std::unique_ptr<Buffer> pBuffer(BufferHelper<>::create(&context));
|
||||
const_cast<MultiGraphicsAllocation &>(pBuffer->getMultiGraphicsAllocation()).setMultiStorage(true);
|
||||
|
||||
ASSERT_TRUE(pBuffer->getMultiGraphicsAllocation().requiresMigrations());
|
||||
|
||||
auto migrationSyncData = static_cast<MockMigrationSyncData *>(pBuffer->getMultiGraphicsAllocation().getMigrationSyncData());
|
||||
|
||||
MigrationController::handleMigration(context, *pCsr0, pBuffer.get());
|
||||
|
||||
EXPECT_EQ(0u, migrationSyncData->signalUsageCalled);
|
||||
}
|
||||
|
||||
TEST_F(MigrationControllerTests, whenMemoryMigrationForMemoryObjectIsAlreadyInProgressThenDoEarlyReturn) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.set(1);
|
||||
|
||||
std::unique_ptr<Buffer> pBuffer(BufferHelper<>::create(&context));
|
||||
|
||||
ASSERT_TRUE(pBuffer->getMultiGraphicsAllocation().requiresMigrations());
|
||||
|
||||
auto migrationSyncData = static_cast<MockMigrationSyncData *>(pBuffer->getMultiGraphicsAllocation().getMigrationSyncData());
|
||||
|
||||
migrationSyncData->startMigration();
|
||||
EXPECT_TRUE(migrationSyncData->isMigrationInProgress());
|
||||
|
||||
MigrationController::migrateMemory(context, *memoryManager, pBuffer.get(), pCsr1->getRootDeviceIndex());
|
||||
|
||||
EXPECT_TRUE(migrationSyncData->isMigrationInProgress());
|
||||
}
|
||||
|
@ -345,6 +345,7 @@ DECLARE_DEBUG_VARIABLE(bool, EnableAsyncEventsHandler, true, "Enables async even
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableForcePin, true, "Enables early pinning for memory object")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeND, true, "Enables different algorithm to compute local work size")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableMultiRootDeviceContexts, true, "Enables support for multi root device contexts")
|
||||
DECLARE_DEBUG_VARIABLE(bool, AllocateBuffersInLocalMemoryForMultiRootDeviceContexts, false, "Enables support for buffers in local memory for multi root device contexts")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, false, "Enables algorithm to compute the most squared work group as possible")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableExtendedVaFormats, false, "Enable more formats in cl-va sharing")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableFormatQuery, true, "Enable sharing format querying")
|
||||
|
@ -39,5 +39,9 @@ class MemoryPropertiesHelper {
|
||||
static GraphicsAllocation::UsmInitialPlacement getUSMInitialPlacement(const MemoryProperties &memoryProperties);
|
||||
|
||||
static void setUSMInitialPlacement(AllocationProperties &allocationProperties, GraphicsAllocation::UsmInitialPlacement initialPlacement);
|
||||
|
||||
static bool useSystemMemoryForCrossRootDeviceAccess(bool multiRootDevice);
|
||||
|
||||
static bool useMultiStorageForCrossRootDeviceAccess(bool multiRootDevice);
|
||||
};
|
||||
} // namespace NEO
|
||||
|
@ -60,4 +60,12 @@ void MemoryPropertiesHelper::setUSMInitialPlacement(AllocationProperties &alloca
|
||||
allocationProperties.usmInitialPlacement = initialPlacement;
|
||||
}
|
||||
|
||||
bool MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(bool multiRootDevice) {
|
||||
return multiRootDevice && !DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.get();
|
||||
}
|
||||
|
||||
bool MemoryPropertiesHelper::useMultiStorageForCrossRootDeviceAccess(bool multiRootDevice) {
|
||||
return multiRootDevice && DebugManager.flags.AllocateBuffersInLocalMemoryForMultiRootDeviceContexts.get();
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "shared/source/helpers/heap_assigner.h"
|
||||
#include "shared/source/helpers/hw_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/helpers/memory_properties_helpers.h"
|
||||
#include "shared/source/helpers/string.h"
|
||||
#include "shared/source/helpers/surface_format_info.h"
|
||||
#include "shared/source/memory_manager/compression_selector.h"
|
||||
@ -460,7 +461,7 @@ bool MemoryManager::getAllocationData(AllocationData &allocationData, const Allo
|
||||
allocationData.useMmapObject = properties.useMmapObject;
|
||||
|
||||
allocationData.flags.crossRootDeviceAccess = properties.flags.crossRootDeviceAccess;
|
||||
allocationData.flags.useSystemMemory |= properties.flags.crossRootDeviceAccess;
|
||||
allocationData.flags.useSystemMemory |= MemoryPropertiesHelper::useSystemMemoryForCrossRootDeviceAccess(properties.flags.crossRootDeviceAccess);
|
||||
|
||||
helper.setExtraAllocationData(allocationData, properties, hwInfo);
|
||||
allocationData.flags.useSystemMemory |= properties.flags.forceSystemMemory;
|
||||
|
@ -21,7 +21,7 @@ class MigrationSyncData : public ReferenceTrackedObject<MigrationSyncData> {
|
||||
uint32_t getCurrentLocation() const;
|
||||
void startMigration();
|
||||
void setCurrentLocation(uint32_t rootDeviceIndex);
|
||||
void signalUsage(volatile uint32_t *tagAddress, uint32_t taskCount);
|
||||
MOCKABLE_VIRTUAL void signalUsage(volatile uint32_t *tagAddress, uint32_t taskCount);
|
||||
bool isUsedByTheSameContext(volatile uint32_t *tagAddress) const;
|
||||
MOCKABLE_VIRTUAL void waitOnCpu();
|
||||
bool isMigrationInProgress() const { return migrationInProgress; }
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Intel Corporation
|
||||
* Copyright (C) 2021-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@ -21,11 +21,16 @@ struct MockMigrationSyncData : public MigrationSyncData {
|
||||
using MigrationSyncData::latestTaskCountUsed;
|
||||
using MigrationSyncData::MigrationSyncData;
|
||||
using MigrationSyncData::tagAddress;
|
||||
void signalUsage(volatile uint32_t *tagAddress, uint32_t taskCount) override {
|
||||
signalUsageCalled++;
|
||||
MigrationSyncData::signalUsage(tagAddress, taskCount);
|
||||
}
|
||||
void waitOnCpu() override {
|
||||
waitOnCpuCalled++;
|
||||
MigrationSyncData::waitOnCpu();
|
||||
}
|
||||
|
||||
uint32_t signalUsageCalled = 0u;
|
||||
uint32_t waitOnCpuCalled = 0u;
|
||||
};
|
||||
|
||||
|
@ -138,6 +138,7 @@ EnableGemCloseWorker = -1
|
||||
EnableHostPtrValidation = -1
|
||||
EnableComputeWorkSizeND = 1
|
||||
EnableMultiRootDeviceContexts = 1
|
||||
AllocateBuffersInLocalMemoryForMultiRootDeviceContexts = 0
|
||||
EnableComputeWorkSizeSquared = 0
|
||||
EnableVaLibCalls = -1
|
||||
EnableExtendedVaFormats = 0
|
||||
|
Reference in New Issue
Block a user