performance: Allow making indirect allocations resident as a pack on OpenCL

Related-To: NEO-11228

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-05-06 13:39:36 +00:00
committed by Compute-Runtime-Automation
parent ece79ba238
commit e35b951a00
6 changed files with 126 additions and 15 deletions

View File

@@ -326,14 +326,9 @@ NEO::WaitStatus CommandQueueImp::CommandBufferManager::switchBuffers(NEO::Comman
void CommandQueueImp::handleIndirectAllocationResidency(UnifiedMemoryControls unifiedMemoryControls, std::unique_lock<std::mutex> &lockForIndirect, bool performMigration) {
NEO::Device *neoDevice = this->device->getNEODevice();
auto svmAllocsManager = this->device->getDriverHandle()->getSvmAllocsManager();
auto submitAsPack = this->device->getDriverHandle()->getMemoryManager()->allowIndirectAllocationsAsPack(neoDevice->getRootDeviceIndex());
if (NEO::debugManager.flags.MakeIndirectAllocationsResidentAsPack.get() != -1) {
submitAsPack = !!NEO::debugManager.flags.MakeIndirectAllocationsResidentAsPack.get();
}
auto submittedAsPack = svmAllocsManager->submitIndirectAllocationsAsPack(*(this->csr));
if (submitAsPack) {
svmAllocsManager->makeIndirectAllocationsResident(*(this->csr), this->csr->peekTaskCount() + 1u);
} else {
if (!submittedAsPack) {
lockForIndirect = this->device->getDriverHandle()->getSvmAllocsManager()->obtainOwnership();
NEO::ResidencyContainer residencyAllocations;
svmAllocsManager->addInternalAllocationsToResidencyContainer(neoDevice->getRootDeviceIndex(),

View File

@@ -1410,7 +1410,11 @@ void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
if (getHasIndirectAccess() && (unifiedMemoryControls.indirectDeviceAllocationsAllowed ||
unifiedMemoryControls.indirectHostAllocationsAllowed ||
unifiedMemoryControls.indirectSharedAllocationsAllowed)) {
this->getContext().getSVMAllocsManager()->makeInternalAllocationsResident(commandStreamReceiver, unifiedMemoryControls.generateMask());
auto svmAllocsManager = this->getContext().getSVMAllocsManager();
auto submittedAsPack = svmAllocsManager->submitIndirectAllocationsAsPack(commandStreamReceiver);
if (!submittedAsPack) {
svmAllocsManager->makeInternalAllocationsResident(commandStreamReceiver, unifiedMemoryControls.generateMask());
}
}
}

View File

@@ -1414,7 +1414,7 @@ HWTEST_F(KernelResidencyTest, givenKernelWhenItUsesIndirectUnifiedMemoryDeviceAl
svmAllocationsManager->freeSVMAlloc(unifiedMemoryAllocation);
}
HWTEST_F(KernelResidencyTest, givenKernelUsingIndirectHostMemoryWhenMakeResidentIsCalledThenOnlyHostAllocationsAreMadeResident) {
HWTEST_F(KernelResidencyTest, givenKernelUsingIndirectHostMemoryWhenMakeResidentIsCalledThenAllAllocationsAreMadeResident) {
MockKernelWithInternals mockKernel(*this->pClDevice);
auto &commandStreamReceiver = this->pDevice->getUltCommandStreamReceiver<FamilyType>();
@@ -1430,14 +1430,13 @@ HWTEST_F(KernelResidencyTest, givenKernelUsingIndirectHostMemoryWhenMakeResident
mockKernel.mockKernel->setUnifiedMemoryProperty(CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL, true);
mockKernel.mockKernel->makeResident(this->pDevice->getGpgpuCommandStreamReceiver());
EXPECT_EQ(1u, commandStreamReceiver.getResidencyAllocations().size());
EXPECT_EQ(commandStreamReceiver.getResidencyAllocations()[0]->getGpuAddress(), castToUint64(unifiedHostMemoryAllocation));
EXPECT_EQ(2u, commandStreamReceiver.getResidencyAllocations().size());
svmAllocationsManager->freeSVMAlloc(unifiedDeviceMemoryAllocation);
svmAllocationsManager->freeSVMAlloc(unifiedHostMemoryAllocation);
}
HWTEST_F(KernelResidencyTest, givenKernelUsingIndirectSharedMemoryWhenMakeResidentIsCalledThenOnlySharedAllocationsAreMadeResident) {
HWTEST_F(KernelResidencyTest, givenKernelUsingIndirectSharedMemoryWhenMakeResidentIsCalledThenAllSharedAllocationsAreMadeResident) {
MockKernelWithInternals mockKernel(*this->pClDevice);
auto &commandStreamReceiver = this->pDevice->getUltCommandStreamReceiver<FamilyType>();
@@ -1452,8 +1451,7 @@ HWTEST_F(KernelResidencyTest, givenKernelUsingIndirectSharedMemoryWhenMakeReside
mockKernel.mockKernel->setUnifiedMemoryProperty(CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, true);
mockKernel.mockKernel->makeResident(this->pDevice->getGpgpuCommandStreamReceiver());
EXPECT_EQ(1u, commandStreamReceiver.getResidencyAllocations().size());
EXPECT_EQ(commandStreamReceiver.getResidencyAllocations()[0]->getGpuAddress(), castToUint64(unifiedSharedMemoryAllocation));
EXPECT_EQ(2u, commandStreamReceiver.getResidencyAllocations().size());
svmAllocationsManager->freeSVMAlloc(unifiedSharedMemoryAllocation);
svmAllocationsManager->freeSVMAlloc(unifiedHostMemoryAllocation);
@@ -2465,6 +2463,74 @@ HWTEST_F(KernelResidencyTest, WhenMakingArgsResidentThenImageFromImageCheckIsCor
EXPECT_EQ(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushBefore, commandStreamReceiver.samplerCacheFlushRequired);
}
// Checks that Kernel::makeResident marks an indirect device USM allocation as
// always resident, and that a repeated makeResident call leaves the residency
// task count of that allocation untouched after the test resets it.
HWTEST_F(KernelResidencyTest, givenKernelWhenMakeResidentIsCalledThenIndirectAllocationsArePacked) {
    auto pKernelInfo = std::make_unique<MockKernelInfo>();
    pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1;

    MockProgram program(toClDeviceVector(*pClDevice));
    MockContext ctx;
    program.setContext(&ctx);

    auto kernel = std::make_unique<MockKernel>(&program, *pKernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel->initialize());
    kernel->setUnifiedMemoryProperty(CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL, true);

    auto &csr = pDevice->getGpgpuCommandStreamReceiver();
    auto svmAllocationsManager = ctx.getSVMAllocsManager();

    SVMAllocsManager::UnifiedMemoryProperties deviceProperties(InternalMemoryType::deviceUnifiedMemory, 1, ctx.getRootDeviceIndices(), ctx.getDeviceBitfields());
    deviceProperties.device = pDevice;

    auto unifiedMemoryAllocation = svmAllocationsManager->createUnifiedMemoryAllocation(4096u, deviceProperties);
    auto graphicsAllocation = svmAllocationsManager->getSVMAlloc(unifiedMemoryAllocation)->gpuAllocations.getDefaultGraphicsAllocation();

    const auto contextId = csr.getOsContext().getContextId();
    auto residencyTaskCount = [&]() { return graphicsAllocation->getResidencyTaskCount(contextId); };

    // First submission: the indirect allocation is expected to end up always resident.
    kernel->makeResident(csr);
    EXPECT_EQ(GraphicsAllocation::objectAlwaysResident, residencyTaskCount());

    // Reset the allocation to the non-resident state by hand.
    graphicsAllocation->updateResidencyTaskCount(GraphicsAllocation::objectNotResident, contextId);

    // Second submission: the expectation is that the task count stays at the
    // value forced above, i.e. the allocation is not made resident again.
    kernel->makeResident(csr);
    EXPECT_EQ(GraphicsAllocation::objectNotResident, residencyTaskCount());

    svmAllocationsManager->freeSVMAlloc(unifiedMemoryAllocation);
}
// Checks that with MakeIndirectAllocationsResidentAsPack forced to 0,
// Kernel::makeResident updates the residency task count of an indirect device
// USM allocation on every call.
HWTEST_F(KernelResidencyTest, givenKernelWhenMakeResidentIsCalledAndPackingIsDisabledThenIndirectAllocationsAreNotPacked) {
    DebugManagerStateRestore dbgStateRestore;
    debugManager.flags.MakeIndirectAllocationsResidentAsPack.set(0);

    auto pKernelInfo = std::make_unique<MockKernelInfo>();
    pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1;

    MockProgram program(toClDeviceVector(*pClDevice));
    MockContext ctx;
    program.setContext(&ctx);

    auto kernel = std::make_unique<MockKernel>(&program, *pKernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel->initialize());
    kernel->setUnifiedMemoryProperty(CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL, true);

    auto &csr = pDevice->getGpgpuCommandStreamReceiver();
    auto svmAllocationsManager = ctx.getSVMAllocsManager();

    SVMAllocsManager::UnifiedMemoryProperties deviceProperties(InternalMemoryType::deviceUnifiedMemory, 1, ctx.getRootDeviceIndices(), ctx.getDeviceBitfields());
    deviceProperties.device = pDevice;

    auto unifiedMemoryAllocation = svmAllocationsManager->createUnifiedMemoryAllocation(4096u, deviceProperties);
    auto graphicsAllocation = svmAllocationsManager->getSVMAlloc(unifiedMemoryAllocation)->gpuAllocations.getDefaultGraphicsAllocation();

    const auto contextId = csr.getOsContext().getContextId();
    auto residencyTaskCount = [&]() { return graphicsAllocation->getResidencyTaskCount(contextId); };

    // First submission stamps the allocation with task count 1.
    kernel->makeResident(csr);
    EXPECT_EQ(1u, residencyTaskCount());

    // Reset the allocation to the non-resident state by hand.
    graphicsAllocation->updateResidencyTaskCount(GraphicsAllocation::objectNotResident, contextId);

    // Without packing the allocation is expected to be stamped again on the next call.
    kernel->makeResident(csr);
    EXPECT_EQ(1u, residencyTaskCount());

    svmAllocationsManager->freeSVMAlloc(unifiedMemoryAllocation);
}
struct KernelExecutionEnvironmentTest : public Test<ClDeviceFixture> {
void SetUp() override {
ClDeviceFixture::setUp();

View File

@@ -909,4 +909,15 @@ void SVMAllocsManager::makeResidentForAllocationsWithId(uint32_t allocationId, C
}
}
// Decides whether indirect allocations should be submitted as a pack for the
// given command stream receiver and, if so, makes them resident.
//
// The decision comes from the memory manager for the CSR's root device unless
// the MakeIndirectAllocationsResidentAsPack debug flag is set to something
// other than -1; in that case any non-zero flag value enables packing and 0
// disables it.
//
// Returns true when the allocations were submitted as a pack (residency was
// requested with task count peekTaskCount() + 1), false when nothing was done
// and the caller has to handle residency on its own.
bool SVMAllocsManager::submitIndirectAllocationsAsPack(CommandStreamReceiver &csr) {
    bool usePack = memoryManager->allowIndirectAllocationsAsPack(csr.getRootDeviceIndex());

    // A flag value of -1 means "no override"; anything else replaces the memory manager's answer.
    const auto packOverride = debugManager.flags.MakeIndirectAllocationsResidentAsPack.get();
    if (packOverride != -1) {
        usePack = (packOverride != 0);
    }

    if (!usePack) {
        return false;
    }

    makeIndirectAllocationsResident(csr, csr.peekTaskCount() + 1u);
    return true;
}
} // namespace NEO

View File

@@ -238,6 +238,8 @@ class SVMAllocsManager {
void initUsmAllocationsCaches(Device &device);
// Makes indirect allocations resident as a pack for the given CSR when packing is
// allowed by the memory manager or forced through the
// MakeIndirectAllocationsResidentAsPack debug flag.
// Returns true if the pack path was taken, false if the caller must handle residency itself.
bool submitIndirectAllocationsAsPack(CommandStreamReceiver &csr);
protected:
void *createZeroCopySvmAllocation(size_t size, const SvmAllocationProperties &svmProperties,
const RootDeviceIndicesContainer &rootDeviceIndices,

View File

@@ -464,7 +464,7 @@ TEST_F(SVMLocalMemoryAllocatorTest, givenInternalAllocationWhenItIsMadeResidentT
EXPECT_EQ(0u, svmManager->indirectAllocationsResidency.size());
svmManager->makeIndirectAllocationsResident(*csr, 1u);
EXPECT_TRUE(svmManager->submitIndirectAllocationsAsPack(*csr));
EXPECT_TRUE(graphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->isResident(csr->getOsContext().getContextId()));
EXPECT_EQ(GraphicsAllocation::objectAlwaysResident, graphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->getResidencyTaskCount(csr->getOsContext().getContextId()));
EXPECT_FALSE(graphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->peekEvictable());
@@ -477,6 +477,39 @@ TEST_F(SVMLocalMemoryAllocatorTest, givenInternalAllocationWhenItIsMadeResidentT
svmManager->freeSVMAlloc(ptr);
}
// Checks that submitIndirectAllocationsAsPack reports false and leaves both the
// allocation and the indirectAllocationsResidency map untouched when the
// MakeIndirectAllocationsResidentAsPack debug flag is forced to 0.
TEST_F(SVMLocalMemoryAllocatorTest, whenSubmitIndirectAllocationsAsPackCalledButAllocationsAsPackNotAllowedThenDontMakeResident) {
    DebugManagerStateRestore restore;
    debugManager.flags.MakeIndirectAllocationsResidentAsPack.set(0);

    auto deviceFactory = std::make_unique<UltDeviceFactory>(1, 2);
    auto device = deviceFactory->rootDevices[0];
    auto memoryManager = static_cast<MockMemoryManager *>(device->getMemoryManager());
    auto svmManager = std::make_unique<MockSVMAllocsManager>(memoryManager, false);
    auto csr = std::make_unique<MockCommandStreamReceiver>(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield());
    csr->setupContext(*device->getDefaultEngine().osContext);

    void *cmdQ = reinterpret_cast<void *>(0x12345);
    memoryManager->pageFaultManager.reset(new MockPageFaultManager());

    SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::sharedUnifiedMemory, 1, rootDeviceIndices, deviceBitfields);
    auto ptr = svmManager->createSharedUnifiedMemoryAllocation(4096u, unifiedMemoryProperties, &cmdQ);
    ASSERT_NE(nullptr, ptr);

    auto defaultAllocation = svmManager->getSVMAlloc(ptr)->gpuAllocations.getDefaultGraphicsAllocation();
    const auto contextId = csr->getOsContext().getContextId();

    // State before the call: nothing resident, nothing tracked.
    EXPECT_FALSE(defaultAllocation->isResident(contextId));
    EXPECT_EQ(0u, svmManager->indirectAllocationsResidency.size());

    EXPECT_FALSE(svmManager->submitIndirectAllocationsAsPack(*csr));

    // State after the call must be unchanged, and no entry may exist for this CSR.
    EXPECT_FALSE(defaultAllocation->isResident(contextId));
    EXPECT_EQ(0u, svmManager->indirectAllocationsResidency.size());
    EXPECT_EQ(svmManager->indirectAllocationsResidency.find(csr.get()), svmManager->indirectAllocationsResidency.end());

    svmManager->freeSVMAlloc(ptr);
}
TEST_F(SVMLocalMemoryAllocatorTest, givenInternalAllocationWhenItIsMadeResidentThenSubsequentCallsDoNotCallResidency) {
std::unique_ptr<UltDeviceFactory> deviceFactory(new UltDeviceFactory(1, 2));
auto device = deviceFactory->rootDevices[0];