performance: enable staging write for cl buffers

Related-To: NEO-13529

Also, add size threshold on iGPU on Linux,
and disable staging if imported host ptr could
be reused

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2025-02-04 10:24:31 +00:00
committed by Compute-Runtime-Automation
parent 35d8e82664
commit b11322332c
14 changed files with 174 additions and 22 deletions

View File

@@ -29,6 +29,7 @@
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/product_helper.h"
#include "shared/source/utilities/api_intercept.h"
#include "shared/source/utilities/staging_buffer_manager.h"
#include "shared/source/utilities/tag_allocator.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
@@ -548,7 +549,9 @@ WaitStatus CommandQueue::waitUntilComplete(TaskCountType gpgpuTaskCountToWait, R
: getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
WAIT_LEAVE()
if (this->context->getStagingBufferManager()) {
this->context->getStagingBufferManager()->resetDetectedPtrs();
}
return waitStatus;
}

View File

@@ -173,7 +173,8 @@ bool CommandQueue::isValidForStagingTransfer(MemObj *memObj, const void *ptr, bo
switch (memObj->peekClMemObjType()) {
case CL_MEM_OBJECT_IMAGE1D:
case CL_MEM_OBJECT_IMAGE2D:
return stagingBufferManager->isValidForStagingTransfer(this->getDevice(), ptr, hasDependencies);
case CL_MEM_OBJECT_BUFFER:
return stagingBufferManager->isValidForStagingTransfer(this->getDevice(), ptr, memObj->getSize(), hasDependencies);
default:
return false;
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2024 Intel Corporation
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -21,6 +21,7 @@
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/utilities/perf_counter.h"
#include "shared/source/utilities/range.h"
#include "shared/source/utilities/staging_buffer_manager.h"
#include "shared/source/utilities/tag_allocator.h"
#include "opencl/extensions/public/cl_ext_private.h"
@@ -553,6 +554,9 @@ void Event::updateExecutionStatus() {
auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage();
allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION);
allocationStorage->cleanAllocationList(this->taskCount, DEFERRED_DEALLOCATION);
if (cmdQueue->getContext().getStagingBufferManager()) {
cmdQueue->getContext().getStagingBufferManager()->resetDetectedPtrs();
}
return;
}

View File

@@ -660,6 +660,45 @@ HWTEST_F(WriteBufferStagingBufferTest, whenEnqueueStagingWriteBufferCalledThenRe
EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
}
// A host pointer that was already queried for staging must be rejected on the
// next query, and becomes eligible again once the event's execution status is updated.
HWTEST_F(WriteBufferStagingBufferTest, whenHostPtrRegisteredThenDontUseStagingUntilEventCompleted) {
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableCopyWithStagingBuffers.set(1);
    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);

    // Non-blocking write that returns an event the test can drive by hand.
    cl_event event;
    const auto enqueueStatus = mockCommandQueueHw.enqueueWriteBuffer(&buffer, CL_FALSE, 0, MemoryConstants::cacheLineSize, ptr, nullptr, 0, nullptr, &event);
    EXPECT_EQ(CL_SUCCESS, enqueueStatus);
    auto writeEvent = castToObjectOrAbort<Event>(event);

    // First query is accepted, the repeated query for the same pointer is not.
    EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false));
    EXPECT_FALSE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false));

    // After the status update the same pointer is accepted again.
    writeEvent->updateExecutionStatus();
    EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false));

    writeEvent->release();
}
// A host pointer that was already queried for staging must be rejected on the
// next query, and becomes eligible again after finish() on the queue.
HWTEST_F(WriteBufferStagingBufferTest, whenHostPtrRegisteredThenDontUseStagingUntilFinishCalled) {
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableCopyWithStagingBuffers.set(1);
    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);

    // First query is accepted, the repeated query for the same pointer is not.
    EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false));
    EXPECT_FALSE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false));

    // finish() makes the pointer acceptable again.
    mockCommandQueueHw.finish();
    EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false));
}
HWTEST_F(WriteBufferStagingBufferTest, whenEnqueueStagingWriteBufferCalledWithLargeSizeThenSplitTransfer) {
auto hostPtr = new unsigned char[chunkSize * 4];
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
@@ -731,3 +770,11 @@ HWTEST_F(WriteBufferStagingBufferTest, whenEnqueueStagingWriteBufferFailedThenPr
EXPECT_EQ(res, CL_INVALID_OPERATION);
EXPECT_EQ(1ul, mockCommandQueueHw.enqueueWriteBufferCounter);
}
// With no debug override, a fresh stack pointer is valid for staging exactly
// when the product helper reports staging buffers as enabled.
HWTEST_F(WriteBufferStagingBufferTest, whenIsValidForStagingTransferCalledThenReturnCorrectValue) {
    // Only the address matters here; the contents are never read by the test.
    unsigned char hostMemory[16];
    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);

    const auto expectedValid = device->getProductHelper().isStagingBuffersEnabled();
    const auto actualValid = mockCommandQueueHw.isValidForStagingTransfer(&buffer, hostMemory, false);
    EXPECT_EQ(expectedValid, actualValid);
}

View File

@@ -811,6 +811,7 @@ HWTEST_F(EnqueueWriteImageTest, whenisValidForStagingTransferCalledThenReturnCor
std::unique_ptr<Image> image(Image1dHelper<>::create(context));
EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingTransfer(image.get(), ptr, false));
pCmdQ->finish();
image.reset(Image2dHelper<>::create(context));
EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingTransfer(image.get(), ptr, false));

View File

@@ -1176,7 +1176,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhe
DebugManagerStateRestore restorer;
debugManager.flags.UpdateTaskCountFromWait.set(3);
CommandQueueHw<FamilyType> commandQueue(nullptr, pClDevice, 0, false);
MockContext context(pClDevice);
CommandQueueHw<FamilyType> commandQueue(&context, pClDevice, 0, false);
commandQueue.taskCount = 10;
auto mockCsr = new MockCsrHw2<FamilyType>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
@@ -1220,7 +1221,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenEnabledDirectSubmissionUpdate
}
};
CommandQueueHw<FamilyType> commandQueue(nullptr, pClDevice, 0, false);
MockContext context(pClDevice);
CommandQueueHw<FamilyType> commandQueue(&context, pClDevice, 0, false);
commandQueue.taskCount = 10;
auto mockCsr = new MockCsrHwDirectSubmission(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());

View File

@@ -58,6 +58,13 @@ bool initDrmOsInterface(std::unique_ptr<HwDeviceId> &&hwDeviceId, uint32_t rootD
return true;
}
// Reports whether a transfer of `size` bytes may use staging.
// Integrated devices are limited to sizes strictly below 512 MB;
// every size is accepted for non-integrated devices.
bool OSInterface::isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const {
    const auto integratedLimit = 512 * MemoryConstants::megaByte;
    return !isIGPU || size < integratedLimit;
}
uint32_t OSInterface::getAggregatedProcessCount() const {
if (driverModel && driverModel->getDriverModelType() == DriverModelType::drm) {
return driverModel->as<Drm>()->getAggregatedProcessCount();

View File

@@ -117,6 +117,7 @@ class OSInterface : public NonCopyableClass {
MOCKABLE_VIRTUAL bool isDebugAttachAvailable() const;
MOCKABLE_VIRTUAL bool isLockablePointer(bool isLockable) const;
MOCKABLE_VIRTUAL bool isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const;
MOCKABLE_VIRTUAL uint32_t getAggregatedProcessCount() const;
static bool osEnabled64kbPages;

View File

@@ -26,6 +26,10 @@ bool OSInterface::isLockablePointer(bool isLockable) const {
return isLockable;
}
// This implementation applies no size limit for staging: both parameters are
// ignored and every size is reported as within the threshold.
bool OSInterface::isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const {
return true;
}
uint32_t OSInterface::getAggregatedProcessCount() const {
return 0;
}

View File

@@ -10,9 +10,13 @@
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/os_interface.h"
#include "shared/source/utilities/heap_allocator.h"
namespace NEO {
StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) {
@@ -285,11 +289,7 @@ void *StagingBufferManager::allocateStagingBuffer(size_t size) {
return hostPtr;
}
bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const {
auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
}
bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) {
auto usmDstData = svmAllocsManager->getSVMAlloc(dstPtr);
auto usmSrcData = svmAllocsManager->getSVMAlloc(srcPtr);
bool hostToUsmCopy = usmSrcData == nullptr && usmDstData != nullptr;
@@ -297,16 +297,25 @@ bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, co
if (usmDstData) {
isUsedByOsContext = usmDstData->gpuAllocations.getGraphicsAllocation(device.getRootDeviceIndex())->isUsedByOsContext(osContextId);
}
return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize);
return this->isValidForStaging(device, srcPtr, size, hasDependencies) && hostToUsmCopy && (isUsedByOsContext || size <= chunkSize);
}
bool StagingBufferManager::isValidForStagingTransfer(const Device &device, const void *ptr, bool hasDependencies) const {
bool StagingBufferManager::isValidForStagingTransfer(const Device &device, const void *ptr, size_t size, bool hasDependencies) {
    // The pointer qualifies only if it is non-null and unknown to the SVM allocations manager.
    bool hostPtrOutsideUsm = false;
    if (ptr != nullptr) {
        hostPtrOutsideUsm = svmAllocsManager->getSVMAlloc(ptr) == nullptr;
    }
    // The common check is evaluated unconditionally, regardless of the pointer classification above.
    const bool passesCommonChecks = this->isValidForStaging(device, ptr, size, hasDependencies);
    return passesCommonChecks && hostPtrOutsideUsm;
}
// Common checks for usm, buffers and images
// Combines: staging enabled (product default or debug override), no dependencies,
// pointer not already registered, and size accepted by the OS interface.
bool StagingBufferManager::isValidForStaging(const Device &device, const void *ptr, size_t size, bool hasDependencies) {
// Product default; the EnableCopyWithStagingBuffers debug flag replaces it when set to anything other than -1.
auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
}
// NOTE(review): the next two statements look like the pre-change body shown by the diff view
// (the USM check now lives in isValidForStagingTransfer) - confirm against the actual file.
auto nonUsmPtr = ptr != nullptr && svmAllocsManager->getSVMAlloc(ptr) == nullptr;
return stagingCopyEnabled && !hasDependencies && nonUsmPtr;
auto isIntegrated = device.getRootDeviceEnvironment().getHardwareInfo()->capabilityTable.isIntegratedDevice;
// Without an OS interface there is nothing to ask, so the size is treated as within the threshold.
auto osInterface = device.getRootDeviceEnvironment().osInterface.get();
bool sizeWithinThreshold = osInterface ? osInterface->isSizeWithinThresholdForStaging(size, isIntegrated) : true;
// registerHostPtr is called unconditionally: the pointer is recorded even when the overall result is false.
// Its return value is true when the pointer had been recorded before.
auto detectedHostPtr = this->registerHostPtr(ptr);
return stagingCopyEnabled && !hasDependencies && !detectedHostPtr && sizeWithinThreshold;
}
void StagingBufferManager::clearTrackedChunks() {
@@ -325,4 +334,16 @@ void StagingBufferManager::trackChunk(const StagingBufferTracker &tracker) {
trackers.push_back(tracker);
}
// Records `ptr` in the set of detected host pointers.
// Returns true when the pointer was already present in the set (i.e. it had been
// registered before and the set has not been cleared since), false when this call added it.
bool StagingBufferManager::registerHostPtr(const void *ptr) {
    std::lock_guard<std::mutex> lock(mtx);
    // A single insert() both adds the pointer and reports, via .second, whether it
    // was new; this avoids a separate find() pass over the set.
    const bool newlyInserted = detectedHostPtrs.insert(ptr).second;
    return !newlyInserted;
}
// Forgets every host pointer recorded so far; the set is emptied under the manager's mutex.
void StagingBufferManager::resetDetectedPtrs() {
    std::lock_guard<std::mutex> guard(mtx);
    detectedHostPtrs.clear();
}
} // namespace NEO

View File

@@ -16,6 +16,7 @@
#include <memory>
#include <mutex>
#include <queue>
#include <set>
namespace NEO {
class SVMAllocsManager;
@@ -78,8 +79,8 @@ class StagingBufferManager {
StagingBufferManager &operator=(StagingBufferManager &&other) noexcept = delete;
StagingBufferManager &operator=(const StagingBufferManager &other) = delete;
bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const;
bool isValidForStagingTransfer(const Device &device, const void *ptr, bool hasDependencies) const;
bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId);
bool isValidForStagingTransfer(const Device &device, const void *ptr, size_t size, bool hasDependencies);
StagingTransferStatus performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);
StagingTransferStatus performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead);
@@ -88,6 +89,9 @@ class StagingBufferManager {
std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size);
void trackChunk(const StagingBufferTracker &tracker);
bool registerHostPtr(const void *ptr);
void resetDetectedPtrs();
private:
std::pair<HeapAllocator *, uint64_t> getExistingBuffer(size_t &size);
void *allocateStagingBuffer(size_t size);
@@ -99,6 +103,8 @@ class StagingBufferManager {
WaitStatus fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const;
WaitStatus drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const;
bool isValidForStaging(const Device &device, const void *ptr, size_t size, bool hasDependencies);
size_t chunkSize = MemoryConstants::pageSize2M;
std::mutex mtx;
std::vector<StagingBuffer> stagingBuffers;
@@ -108,6 +114,8 @@ class StagingBufferManager {
const RootDeviceIndicesContainer rootDeviceIndices;
const std::map<uint32_t, DeviceBitfield> deviceBitfields;
const bool requiresWritable = false;
std::set<const void *> detectedHostPtrs;
};
} // namespace NEO

View File

@@ -94,4 +94,15 @@ TEST(OsInterfaceTest, whenOsInterfaceSetupGmmInputArgsThenArgsAreSet) {
EXPECT_EQ(GMM_CLIENT::GMM_OCL_VISTA, passedInputArgs.ClientType);
}
// With a DRM driver model installed, a 1 GB transfer is within the staging
// threshold for a non-integrated device and outside it for an integrated one.
TEST(OsInterfaceTest, GivenLinuxOsInterfaceWhenGetThresholdForStagingCalledThenReturnThresholdForIntegratedDevices) {
    OSInterface osInterface;
    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();

    // Ownership of the mock passes to the OS interface.
    std::unique_ptr<DriverModel> driverModel(new DrmMock(*executionEnvironment->rootDeviceEnvironments[0]));
    osInterface.setDriverModel(std::move(driverModel));

    const auto transferSize = MemoryConstants::gigaByte;
    EXPECT_TRUE(osInterface.isSizeWithinThresholdForStaging(transferSize, false));
    EXPECT_FALSE(osInterface.isSizeWithinThresholdForStaging(transferSize, true));
}
} // namespace NEO

View File

@@ -153,3 +153,14 @@ TEST_F(OsInterfaceTest, givenEnableFtrTile64OptimizationDebugKeyWhenSetThenPrope
EXPECT_EQ(1u, passedFtrTable.FtrTile64Optimization);
}
}
// With a WDDM mock initialized, a 1 GB transfer is within the staging threshold
// for both non-integrated and integrated devices.
TEST_F(OsInterfaceTest, whenGetThresholdForStagingCalledThenReturnNoThreshold) {
    MockExecutionEnvironment mockEnvironment;
    auto &rootEnv = *mockEnvironment.rootDeviceEnvironments[0];
    auto &osInterface = rootEnv.osInterface;

    // The OS interface is expected to be absent before init() and present after it.
    auto wddmMock = new WddmMock(rootEnv);
    EXPECT_EQ(nullptr, osInterface.get());
    wddmMock->init();
    EXPECT_NE(nullptr, osInterface.get());

    const bool integratedVariants[] = {false, true};
    for (const bool isIGPU : integratedVariants) {
        EXPECT_TRUE(osInterface->isSizeWithinThresholdForStaging(MemoryConstants::gigaByte, isIGPU));
    }
}

View File

@@ -5,6 +5,7 @@
*
*/
#include "shared/source/os_interface/os_interface.h"
#include "shared/source/utilities/staging_buffer_manager.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
@@ -20,6 +21,13 @@
using namespace NEO;
// OSInterface test double whose staging-threshold answer is set by the test
// through isSizeWithinThresholdValue; the size and device kind are ignored.
struct MockOSIface : OSInterface {
    bool isSizeWithinThresholdValue = true;

    bool isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const override {
        return this->isSizeWithinThresholdValue;
    }
};
class StagingBufferManagerFixture : public DeviceFixture {
public:
void setUp() {
@@ -31,6 +39,8 @@ class StagingBufferManagerFixture : public DeviceFixture {
std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
this->stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields, false);
this->csr = pDevice->commandStreamReceivers[0].get();
this->osInterface = new MockOSIface{};
this->pDevice->getRootDeviceEnvironmentRef().osInterface.reset(this->osInterface);
}
void tearDown() {
@@ -165,6 +175,7 @@ class StagingBufferManagerFixture : public DeviceFixture {
std::unique_ptr<MockSVMAllocsManager> svmAllocsManager;
std::unique_ptr<StagingBufferManager> stagingBufferManager;
CommandStreamReceiver *csr;
MockOSIface *osInterface;
};
using StagingBufferManagerTest = Test<StagingBufferManagerFixture>;
@@ -199,18 +210,31 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForCopyThenRe
}
auto actualValid = stagingBufferManager->isValidForCopy(*pDevice, copyParamsStruct[i].dstPtr, copyParamsStruct[i].srcPtr, copyParamsStruct[i].size, copyParamsStruct[i].hasDependencies, 0u);
EXPECT_EQ(actualValid, copyParamsStruct[i].expectValid);
stagingBufferManager->resetDetectedPtrs();
}
debugManager.flags.EnableCopyWithStagingBuffers.set(0);
EXPECT_FALSE(stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, bufferSize, false, 0u));
stagingBufferManager->resetDetectedPtrs();
debugManager.flags.EnableCopyWithStagingBuffers.set(-1);
auto isStaingBuffersEnabled = pDevice->getProductHelper().isStagingBuffersEnabled();
EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, bufferSize, false, 0u));
auto isStagingBuffersEnabled = pDevice->getProductHelper().isStagingBuffersEnabled();
EXPECT_EQ(isStagingBuffersEnabled, stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, bufferSize, false, 0u));
stagingBufferManager->registerHostPtr(nonUsmBuffer);
EXPECT_FALSE(stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, bufferSize, false, 0u));
stagingBufferManager->resetDetectedPtrs();
this->osInterface->isSizeWithinThresholdValue = false;
EXPECT_FALSE(stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, bufferSize, false, 0u));
stagingBufferManager->resetDetectedPtrs();
this->pDevice->getRootDeviceEnvironmentRef().osInterface.reset(nullptr);
EXPECT_EQ(isStagingBuffersEnabled, stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, bufferSize, false, 0u));
svmAllocsManager->freeSVMAlloc(usmBuffer);
}
TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForImageWriteThenReturnCorrectValue) {
TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForStagingTransferThenReturnCorrectValue) {
constexpr size_t bufferSize = 1024;
auto usmBuffer = allocateDeviceBuffer(bufferSize);
unsigned char nonUsmBuffer[bufferSize];
@@ -226,16 +250,23 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForImageWrite
{nonUsmBuffer, true, false},
};
for (auto i = 0; i < 4; i++) {
auto actualValid = stagingBufferManager->isValidForStagingTransfer(*pDevice, copyParamsStruct[i].ptr, copyParamsStruct[i].hasDependencies);
auto actualValid = stagingBufferManager->isValidForStagingTransfer(*pDevice, copyParamsStruct[i].ptr, bufferSize, copyParamsStruct[i].hasDependencies);
EXPECT_EQ(actualValid, copyParamsStruct[i].expectValid);
}
debugManager.flags.EnableCopyWithStagingBuffers.set(0);
EXPECT_FALSE(stagingBufferManager->isValidForStagingTransfer(*pDevice, nonUsmBuffer, false));
EXPECT_FALSE(stagingBufferManager->isValidForStagingTransfer(*pDevice, nonUsmBuffer, bufferSize, false));
debugManager.flags.EnableCopyWithStagingBuffers.set(-1);
auto isStaingBuffersEnabled = pDevice->getProductHelper().isStagingBuffersEnabled();
EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForStagingTransfer(*pDevice, nonUsmBuffer, false));
stagingBufferManager->resetDetectedPtrs();
EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForStagingTransfer(*pDevice, nonUsmBuffer, bufferSize, false));
EXPECT_FALSE(stagingBufferManager->isValidForStagingTransfer(*pDevice, usmBuffer, bufferSize, false));
stagingBufferManager->registerHostPtr(nonUsmBuffer);
EXPECT_FALSE(stagingBufferManager->isValidForStagingTransfer(*pDevice, nonUsmBuffer, bufferSize, false));
svmAllocsManager->freeSVMAlloc(usmBuffer);
}