fix: remove doubled operations when executing regular on immediate

- the immediate flush path does not need to perform the operations below when it executes a command list:
- handling indirect allocations
- performing page fault migration
- making unified shared memory resident
- performing prefetching when required

Related-To: NEO-10356

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-04-09 08:58:37 +00:00
committed by Compute-Runtime-Automation
parent 18f752ee2f
commit b31c3bb3ca
7 changed files with 265 additions and 24 deletions

View File

@@ -451,6 +451,8 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
auto csr = cmdQImp->getCsr();
auto lockCSR = outerLock != nullptr ? std::move(*outerLock) : csr->obtainUniqueOwnership();
std::unique_lock<std::mutex> lockForIndirect;
if (appendOperation != NEO::AppendOperations::cmdList) {
if (NEO::ApiSpecificConfig::isSharedAllocPrefetchEnabled()) {
auto svmAllocMgr = this->device->getDriverHandle()->getSvmAllocsManager();
@@ -458,35 +460,34 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
}
cmdQ->registerCsrClient();
}
std::unique_lock<std::mutex> lockForIndirect;
if (this->hasIndirectAllocationsAllowed()) {
cmdQ->handleIndirectAllocationResidency(this->getUnifiedMemoryControls(), lockForIndirect, performMigration);
}
if (performMigration) {
auto deviceImp = static_cast<DeviceImp *>(this->device);
auto pageFaultManager = deviceImp->getDriverHandle()->getMemoryManager()->getPageFaultManager();
if (pageFaultManager == nullptr) {
performMigration = false;
if (this->hasIndirectAllocationsAllowed()) {
cmdQ->handleIndirectAllocationResidency(this->getUnifiedMemoryControls(), lockForIndirect, performMigration);
}
}
cmdQ->makeResidentAndMigrate(performMigration, this->commandContainer.getResidencyContainer());
if (performMigration) {
auto deviceImp = static_cast<DeviceImp *>(this->device);
auto pageFaultManager = deviceImp->getDriverHandle()->getMemoryManager()->getPageFaultManager();
if (pageFaultManager == nullptr) {
performMigration = false;
}
}
static_cast<CommandQueueHw<gfxCoreFamily> *>(this->cmdQImmediate)->patchCommands(*this, 0u, false);
cmdQ->makeResidentAndMigrate(performMigration, this->commandContainer.getResidencyContainer());
if (performMigration) {
this->migrateSharedAllocations();
}
if (performMigration) {
this->migrateSharedAllocations();
}
if (this->performMemoryPrefetch) {
auto prefetchManager = this->device->getDriverHandle()->getMemoryManager()->getPrefetchManager();
prefetchManager->migrateAllocationsToGpu(this->getPrefetchContext(),
*this->device->getDriverHandle()->getSvmAllocsManager(),
*this->device->getNEODevice(),
*csr);
if (this->performMemoryPrefetch) {
auto prefetchManager = this->device->getDriverHandle()->getMemoryManager()->getPrefetchManager();
prefetchManager->migrateAllocationsToGpu(this->getPrefetchContext(),
*this->device->getDriverHandle()->getSvmAllocsManager(),
*this->device->getNEODevice(),
*csr);
}
static_cast<CommandQueueHw<gfxCoreFamily> *>(this->cmdQImmediate)->patchCommands(*this, 0u, false);
}
NEO::CompletionStamp completionStamp;

View File

@@ -144,6 +144,11 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
return returnCode;
}
// Test hook: bumps handleIndirectAllocationResidencyCalledTimes and then forwards
// all three arguments unchanged to BaseClass::handleIndirectAllocationResidency.
void handleIndirectAllocationResidency(UnifiedMemoryControls unifiedMemoryControls, std::unique_lock<std::mutex> &lockForIndirect, bool performMigration) override {
    ++handleIndirectAllocationResidencyCalledTimes;
    BaseClass::handleIndirectAllocationResidency(unifiedMemoryControls, lockForIndirect, performMigration);
}
NEO::GraphicsAllocation *recordedGlobalStatelessAllocation = nullptr;
NEO::ScratchSpaceController *recordedScratchController = nullptr;
uint32_t synchronizedCalled = 0;
@@ -151,6 +156,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
ze_result_t synchronizeReturnValue{ZE_RESULT_SUCCESS};
std::optional<NEO::WaitStatus> reserveLinearStreamSizeReturnValue{};
std::optional<NEO::SubmissionStatus> submitBatchBufferReturnValue{};
uint32_t handleIndirectAllocationResidencyCalledTimes = 0;
bool recordedLockScratchController = false;
};

View File

@@ -34,6 +34,7 @@
#include "level_zero/core/test/unit_tests/mocks/mock_event.h"
#include "level_zero/core/test/unit_tests/mocks/mock_image.h"
#include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
#include "level_zero/core/test/unit_tests/sources/helper/ze_object_utils.h"
#include "test_traits_common.h"
@@ -1281,6 +1282,39 @@ HWTEST2_F(CommandListTest, givenCmdListWithIndirectAccessWhenExecutingCommandLis
commandListImmediate.cmdQImmediate = oldCommandQueue;
}
// Appends a closed regular command list (with indirectAllocationsAllowed set) to an immediate
// command list whose queue has been replaced by a MockCommandQueueHw, and expects
// handleIndirectAllocationResidency to be invoked on that mock queue exactly once.
HWTEST2_F(CommandListTest, givenRegularCmdListWithIndirectAccessWhenExecutingRegularOnImmediateCommandListThenHandleIndirectAccessCalled, MatchAny) {
    ze_command_queue_desc_t desc = {};
    desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;

    ze_result_t returnValue;
    std::unique_ptr<L0::CommandList> commandListImmediate(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::compute, returnValue));
    // Guard the dereference below against a failed creation.
    ASSERT_NE(nullptr, commandListImmediate);
    auto &mockCommandListImmediate = static_cast<MockCommandListImmediate<FamilyType::gfxCoreFamily> &>(*commandListImmediate);

    // NOTE(review): this receiver is not referenced by the rest of the test; it looks like a
    // leftover from a sibling test - confirm before removing.
    MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());

    auto mockCmdQHw = makeZeUniquePtr<MockCommandQueueHw<FamilyType::gfxCoreFamily>>(device, device->getNEODevice()->getDefaultEngine().commandStreamReceiver, &desc);
    mockCmdQHw->initialize(false, false, true);

    std::unique_ptr<L0::CommandList> commandListRegular(CommandList::create(productFamily,
                                                                            device,
                                                                            NEO::EngineGroupType::compute,
                                                                            0u,
                                                                            returnValue, false));
    ASSERT_NE(nullptr, commandListRegular);
    auto &mockCommandListRegular = static_cast<CommandListCoreFamily<FamilyType::gfxCoreFamily> &>(*commandListRegular);
    mockCommandListRegular.indirectAllocationsAllowed = true;
    commandListRegular->close();
    auto cmdListHandle = commandListRegular->toHandle();

    // Swap in the mock queue only after every fatal ASSERT_* above: a fatal assertion returns from
    // the test body immediately, which would skip the restore at the end and leave cmdQImmediate
    // pointing at mockCmdQHw. From here on only non-fatal EXPECT_* checks are used.
    auto oldCommandQueue = mockCommandListImmediate.cmdQImmediate;
    mockCommandListImmediate.cmdQImmediate = mockCmdQHw.get();

    returnValue = commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    EXPECT_EQ(mockCmdQHw->handleIndirectAllocationResidencyCalledTimes, 1u);

    // Hand the original queue back to the immediate command list before the locals are destroyed.
    mockCommandListImmediate.cmdQImmediate = oldCommandQueue;
}
HWTEST2_F(CommandListTest, givenCmdListWithNoIndirectAccessWhenExecutingCommandListImmediateWithFlushTaskThenHandleIndirectAccessNotCalled, MatchAny) {
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;

View File

@@ -581,6 +581,67 @@ HWTEST_F(CommandQueueIndirectAllocations, givenDeviceThatSupportsSubmittingIndir
device->getDriverHandle()->getSvmAllocsManager()->freeSVMAlloc(deviceAlloc);
}
// Records a kernel launch with indirect device access on a regular command list, executes that
// list through an immediate command list, and checks that a separately created device allocation
// ends up resident with the objectAlwaysResident task count for the CSR's OS context.
HWTEST_F(CommandQueueIndirectAllocations,
         givenDeviceThatSupportsSubmittingIndirectAllocationsAsPackWhenIndirectAccessIsUsedWhenExecutingRegularOnImmediateThenWholePackIsMadeResidentWithRegularCommandList) {
    DebugManagerStateRestore stateRestore;
    NEO::debugManager.flags.EnableFlushTaskSubmission.set(true);

    // Local CSR bound to the default engine's OS context; it is only used below to obtain the context id.
    MockCsrHw2<FamilyType> mockCsr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
    mockCsr.initializeTagAllocation();
    mockCsr.setupContext(*neoDevice->getDefaultEngine().osContext);

    ze_result_t createStatus;
    ze_command_queue_desc_t queueDesc = {};
    queueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;

    std::unique_ptr<L0::CommandList> immediateList(
        CommandList::createImmediate(productFamily, device, &queueDesc, false, NEO::EngineGroupType::compute, createStatus));
    ASSERT_NE(nullptr, immediateList);

    std::unique_ptr<L0::CommandList> regularList(
        CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, createStatus, false));
    ASSERT_NE(nullptr, regularList);

    // Device allocation whose residency is inspected at the end of the test.
    void *deviceAlloc = nullptr;
    ze_device_mem_alloc_desc_t deviceDesc = {};
    auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &deviceAlloc);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    auto svmData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAllocs()->get(deviceAlloc);
    auto gpuAlloc = svmData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
    ASSERT_NE(nullptr, gpuAlloc);

    // Kernel flagged as using indirect device allocations.
    createKernel();
    kernel->unifiedMemoryControls.indirectDeviceAllocationsAllowed = true;
    kernel->kernelHasIndirectAccess = true;
    EXPECT_TRUE(kernel->getUnifiedMemoryControls().indirectDeviceAllocationsAllowed);

    auto mockMemoryManager = static_cast<MockMemoryManager *>(driverHandle->getMemoryManager());
    mockMemoryManager->overrideAllocateAsPackReturn = 1u;

    ze_group_count_t groupCount{1, 1, 1};
    CmdListKernelLaunchParams launchParams = {};
    result = regularList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    regularList->close();

    // Execute the regular list on the immediate list.
    auto regularHandle = regularList->toHandle();
    result = immediateList->appendCommandLists(1, &regularHandle, nullptr, 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    const auto contextId = mockCsr.getOsContext().getContextId();
    EXPECT_TRUE(gpuAlloc->isResident(contextId));
    EXPECT_EQ(GraphicsAllocation::objectAlwaysResident, gpuAlloc->getResidencyTaskCount(contextId));

    device->getDriverHandle()->getSvmAllocsManager()->freeSVMAlloc(deviceAlloc);
}
HWTEST_F(CommandQueueIndirectAllocations, givenImmediateCommandListAndFlushTaskWithIndirectAllocsAsPackDisabledThenLaunchKernelWorks) {
DebugManagerStateRestore restorer;
NEO::debugManager.flags.EnableFlushTaskSubmission.set(true);

View File

@@ -956,6 +956,70 @@ HWTEST_F(ContextMakeMemoryResidentAndMigrationTests,
context->freeMem(dstBuffer);
}
// Makes the fixture's `ptr` allocation resident, records a memory fill into a separate shared
// buffer on a regular command list, executes that list through an immediate command list, and
// expects the page fault manager mock to report exactly one move to the GPU domain, for `ptr`.
HWTEST_F(ContextMakeMemoryResidentAndMigrationTests,
         GivenImmediateCommandListWhenExecutingRegularCommandListsHavingSharedAllocationWithMigrationOnImmediateThenMemoryFromMakeResidentIsMovedToGpuOnce) {
    DriverHandleImp *driverHandleImp = static_cast<DriverHandleImp *>(hostDriverHandle.get());
    size_t previousSize = driverHandleImp->sharedMakeResidentAllocations.size();

    // makeMemoryResident is expected to add one entry to sharedMakeResidentAllocations.
    mockMemoryInterface->makeResidentResult = NEO::MemoryOperationsStatus::success;
    ze_result_t res = context->makeMemoryResident(device, ptr, size);
    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
    size_t currentSize = driverHandleImp->sharedMakeResidentAllocations.size();
    EXPECT_EQ(previousSize + 1, currentSize);

    const ze_command_queue_desc_t desc = {};
    MockCsrHw2<FamilyType> csr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
    csr.initializeTagAllocation();
    csr.setupContext(*neoDevice->getDefaultEngine().osContext);

    ze_result_t result = ZE_RESULT_SUCCESS;
    DebugManagerStateRestore restorer;
    NEO::debugManager.flags.EnableFlushTaskSubmission.set(true);

    std::unique_ptr<L0::CommandList> commandListImmediate(CommandList::createImmediate(productFamily,
                                                                                       device,
                                                                                       &desc,
                                                                                       false,
                                                                                       NEO::EngineGroupType::compute,
                                                                                       result));
    ASSERT_NE(nullptr, commandListImmediate);

    // Both command lists are created and null-checked before dstBuffer is allocated, so a failed
    // creation is reported as a test failure instead of a null dereference, and the fatal
    // assertion cannot return with dstBuffer still allocated.
    std::unique_ptr<L0::CommandList> commandListRegular(CommandList::create(productFamily,
                                                                            device,
                                                                            NEO::EngineGroupType::compute,
                                                                            0u,
                                                                            result, false));
    ASSERT_NE(nullptr, commandListRegular);

    void *dstBuffer = nullptr;
    ze_device_mem_alloc_desc_t deviceDesc = {};
    ze_host_mem_alloc_desc_t hostDesc = {};
    // NOTE(review): 4090u is passed as the alignment argument and is not a power of two - confirm this is intended.
    result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, 16384u, 4090u, &dstBuffer);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    int one = 1;
    result = commandListRegular->appendMemoryFill(dstBuffer, reinterpret_cast<void *>(&one), sizeof(one), 4090u,
                                                  nullptr, 0, nullptr, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    commandListRegular->close();

    // Execute the regular list on the immediate list.
    auto cmdListHandle = commandListRegular->toHandle();
    result = commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    // Exactly one migration is expected, and it must target the allocation made resident above.
    EXPECT_EQ(mockPageFaultManager->moveAllocationToGpuDomainCalledTimes, 1u);
    EXPECT_EQ(mockPageFaultManager->migratedAddress, ptr);

    mockMemoryInterface->evictResult = NEO::MemoryOperationsStatus::success;
    res = context->evictMemory(device, ptr, size);
    EXPECT_EQ(ZE_RESULT_SUCCESS, res);

    context->freeMem(ptr);
    context->freeMem(dstBuffer);
}
HWTEST_F(ContextMakeMemoryResidentAndMigrationTests,
whenExecutingImmediateCommandListsHavingHostAllocationWithMigrationThenMemoryFromMakeResidentIsMovedToGpu) {
DriverHandleImp *driverHandleImp = static_cast<DriverHandleImp *>(hostDriverHandle.get());

View File

@@ -646,6 +646,79 @@ HWTEST2_F(CommandListStatePrefetchXeHpcCore, givenAppendMemoryPrefetchForKmdMigr
commandQueue->destroy();
}
// Records a memory prefetch and a kernel launch on a regular command list, executes that list
// through an immediate command list, and expects the MockPrefetchManager to have seen exactly one
// migrateAllocationsToGpu call. Resetting the regular list must then clear its prefetch context.
HWTEST2_F(CommandListStatePrefetchXeHpcCore, givenAppendMemoryPrefetchForKmdMigratedSharedAllocationsSetOnRegularCmdListWhenPrefetchApiIsCalledOnUnifiedSharedMemoryAndRegularExecutedOnImmediateThenCallMigrateAllocationsToGpuOnce, IsXeHpcCore) {
    DebugManagerStateRestore stateRestore;
    debugManager.flags.UseKmdMigration.set(1);

    neoDevice->deviceBitfield = 0b1000;

    // Install the counting prefetch manager mock on the memory manager.
    auto memoryManager = static_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager());
    memoryManager->prefetchManager.reset(new MockPrefetchManager());

    createKernel();

    ze_result_t returnValue;
    ze_command_queue_desc_t queueDesc = {};

    // The regular list is held through its handle and destroyed explicitly at the end of the test.
    ze_command_list_handle_t regularHandle = CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, returnValue, false)->toHandle();
    auto regularList = CommandList::fromHandle(regularHandle);

    std::unique_ptr<L0::CommandList> immediateList(
        CommandList::createImmediate(productFamily, device, &queueDesc, false, NEO::EngineGroupType::compute, returnValue));

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;

    std::unique_ptr<EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    std::unique_ptr<Event> event(Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));

    // Shared allocation that is the target of the prefetch.
    size_t allocSize = 10;
    size_t alignment = 1u;
    void *sharedAlloc = nullptr;
    ze_device_mem_alloc_desc_t deviceDesc = {};
    ze_host_mem_alloc_desc_t hostDesc = {};
    auto result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, allocSize, alignment, &sharedAlloc);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_NE(nullptr, sharedAlloc);

    result = regularList->appendMemoryPrefetch(sharedAlloc, allocSize);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto prefetchManager = static_cast<MockPrefetchManager *>(memoryManager->prefetchManager.get());
    EXPECT_EQ(1u, regularList->getPrefetchContext().allocations.size());

    ze_group_count_t groupCount{1, 1, 1};
    CmdListKernelLaunchParams launchParams = {};
    result = regularList->appendLaunchKernel(kernel->toHandle(), groupCount, event->toHandle(), 0, nullptr, launchParams, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    regularList->close();

    // Execute the regular list on the immediate list.
    result = immediateList->appendCommandLists(1, &regularHandle, nullptr, 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    EXPECT_TRUE(memoryManager->setMemPrefetchCalled);
    EXPECT_EQ(3u, memoryManager->memPrefetchSubDeviceIds[0]);

    // The migration must have happened once, and the prefetch context must still hold its entry.
    EXPECT_TRUE(prefetchManager->migrateAllocationsToGpuCalled);
    EXPECT_EQ(1u, prefetchManager->migrateAllocationsToGpuCalledCount);
    EXPECT_EQ(1u, regularList->getPrefetchContext().allocations.size());

    regularList->reset();
    EXPECT_TRUE(prefetchManager->removeAllocationsCalled);
    EXPECT_EQ(0u, regularList->getPrefetchContext().allocations.size());

    context->freeMem(sharedAlloc);
    regularList->destroy();
}
HWTEST2_F(CommandListStatePrefetchXeHpcCore, givenAppendMemoryPrefetchForKmdMigratedSharedAllocationsSetWhenPrefetchApiIsCalledForUnifiedSharedMemoryOnCmdListCopyOnlyThenCallMigrateAllocationsToGpu, IsXeHpcCore) {
DebugManagerStateRestore restore;
debugManager.flags.UseKmdMigration.set(1);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022 Intel Corporation
* Copyright (C) 2022-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -17,6 +17,7 @@ class MockPrefetchManager : public PrefetchManager {
// Delegates to PrefetchManager::migrateAllocationsToGpu with the same arguments, then records
// the call in both the call counter and the boolean flag.
void migrateAllocationsToGpu(PrefetchContext &prefetchContext, SVMAllocsManager &unifiedMemoryManager, Device &device, CommandStreamReceiver &csr) override {
    PrefetchManager::migrateAllocationsToGpu(prefetchContext, unifiedMemoryManager, device, csr);
    ++migrateAllocationsToGpuCalledCount;
    migrateAllocationsToGpuCalled = true;
}
void removeAllocations(PrefetchContext &prefetchContext) override {
@@ -24,6 +25,7 @@ class MockPrefetchManager : public PrefetchManager {
removeAllocationsCalled = true;
}
uint32_t migrateAllocationsToGpuCalledCount = 0;
bool migrateAllocationsToGpuCalled = false;
bool removeAllocationsCalled = false;
};