performance: use BCS for transfers if CCS is busy

Related-To: NEO-11501

Also, if the device is an iGPU, don't use staging buffers
when CCS is busy.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-07-19 10:25:15 +00:00
committed by Compute-Runtime-Automation
parent 17380dcbf3
commit 39ec7facee
18 changed files with 211 additions and 26 deletions

View File

@@ -2607,4 +2607,55 @@ HWTEST_F(StagingBufferTest, givenIsValidForStagingBufferCopyWhenSrcIsMappedThenR
MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
auto [buffer, mappedPtr] = createBufferAndMapItOnGpu();
EXPECT_FALSE(myCmdQ.isValidForStagingBufferCopy(pClDevice->getDevice(), dstPtr, mappedPtr, buffer->getSize(), false));
}
}
// Checks isValidForStagingBufferCopy() for a device-USM destination with
// EnableCopyWithStagingBuffers forced on:
//   - taskCount == tag (0/0)                     -> true
//   - taskCount (1) ahead of tag (0), in-order   -> true
//   - same CSR state after setOoqEnabled()       -> false
// NOTE(review): the test name says "NotLocalMemory", but EnableLocalMemory is left at
// its default here - confirm the default guarantees local memory is disabled.
HWTEST_F(StagingBufferTest, givenIsValidForStagingBufferCopyWhenIsNotLocalMemoryAndOOQAndGpuBusyThenReturnFalse) {
    DebugManagerStateRestore restore{};
    debugManager.flags.EnableCopyWithStagingBuffers.set(1);

    // Stand-alone device/context/queue, independent of the objects owned by the fixture.
    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
    auto clContext = std::make_unique<MockContext>(clDevice.get());
    MockCommandQueueHw<FamilyType> queue(clContext.get(), clDevice.get(), 0);

    // Device unified-memory allocation used as the copy destination.
    SVMAllocsManager::UnifiedMemoryProperties usmProperties(InternalMemoryType::deviceUnifiedMemory, 1, clContext->getRootDeviceIndices(), clContext->getDeviceBitfields());
    usmProperties.device = &clDevice->getDevice();
    auto *svmManager = clContext->getSVMAllocsManager();
    auto usmDst = svmManager->createUnifiedMemoryAllocation(copySize, usmProperties);

    auto &gpgpuCsr = queue.getGpgpuCommandStreamReceiver();
    auto *ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(&gpgpuCsr);

    // Single place for the query under test; only queue/CSR state changes between calls.
    auto stagingCopyAllowed = [&]() {
        return queue.isValidForStagingBufferCopy(clDevice->getDevice(), usmDst, srcPtr, 1024ul, false);
    };

    // Tag and task count are equal.
    *ultCsr->tagAddress = 0u;
    ultCsr->taskCount = 0u;
    EXPECT_TRUE(stagingCopyAllowed());

    // Task count is ahead of the tag (the "GpuBusy" state per the test name); queue is still in-order.
    *ultCsr->tagAddress = 0u;
    ultCsr->taskCount = 1u;
    EXPECT_TRUE(stagingCopyAllowed());

    // Same CSR state, but the queue is now out-of-order.
    queue.setOoqEnabled();
    EXPECT_FALSE(stagingCopyAllowed());

    svmManager->freeSVMAlloc(usmDst);
}
// Checks isValidForStagingBufferCopy() for a device-USM destination with both
// EnableCopyWithStagingBuffers and EnableLocalMemory forced on:
//   - taskCount == tag (0/0)                                   -> true
//   - taskCount (1) ahead of tag (0) on an out-of-order queue  -> true
HWTEST_F(StagingBufferTest, givenIsValidForStagingBufferCopyWhenIsLocalMemoryAndOOQAndGpuBusyThenReturnTrue) {
    DebugManagerStateRestore restore{};
    debugManager.flags.EnableCopyWithStagingBuffers.set(1);
    debugManager.flags.EnableLocalMemory.set(1);

    // Stand-alone device/context/queue, created after the debug flags are set.
    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
    auto clContext = std::make_unique<MockContext>(clDevice.get());
    MockCommandQueueHw<FamilyType> queue(clContext.get(), clDevice.get(), 0);

    // Device unified-memory allocation used as the copy destination.
    SVMAllocsManager::UnifiedMemoryProperties usmProperties(InternalMemoryType::deviceUnifiedMemory, 1, clContext->getRootDeviceIndices(), clContext->getDeviceBitfields());
    usmProperties.device = &clDevice->getDevice();
    auto *svmManager = clContext->getSVMAllocsManager();
    auto usmDst = svmManager->createUnifiedMemoryAllocation(copySize, usmProperties);

    auto &gpgpuCsr = queue.getGpgpuCommandStreamReceiver();
    auto *ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(&gpgpuCsr);

    // Single place for the query under test; only queue/CSR state changes between calls.
    auto stagingCopyAllowed = [&]() {
        return queue.isValidForStagingBufferCopy(clDevice->getDevice(), usmDst, srcPtr, 1024ul, false);
    };

    // Tag and task count are equal.
    *ultCsr->tagAddress = 0u;
    ultCsr->taskCount = 0u;
    EXPECT_TRUE(stagingCopyAllowed());

    // Task count is ahead of the tag (the "GpuBusy" state per the test name) and the queue is out-of-order.
    *ultCsr->tagAddress = 0u;
    ultCsr->taskCount = 1u;
    queue.setOoqEnabled();
    EXPECT_TRUE(stagingCopyAllowed());

    svmManager->freeSVMAlloc(usmDst);
}