performance: enable staging write for cl buffers

Related-To: NEO-13529

Also, add a size threshold on iGPU on Linux,
and disable staging if an imported host pointer
could be reused.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2025-02-04 10:24:31 +00:00
committed by Compute-Runtime-Automation
parent 35d8e82664
commit b11322332c
14 changed files with 174 additions and 22 deletions

View File

@@ -58,6 +58,13 @@ bool initDrmOsInterface(std::unique_ptr<HwDeviceId> &&hwDeviceId, uint32_t rootD
return true;
}
// Decides whether a transfer of the given size may use staging buffers.
// On integrated GPUs only sizes below a 512 MB cap are accepted; on
// discrete GPUs every size passes.
bool OSInterface::isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const {
    const size_t igpuSizeCap = 512 * MemoryConstants::megaByte;
    return !isIGPU || (size < igpuSizeCap);
}
uint32_t OSInterface::getAggregatedProcessCount() const {
if (driverModel && driverModel->getDriverModelType() == DriverModelType::drm) {
return driverModel->as<Drm>()->getAggregatedProcessCount();

View File

@@ -117,6 +117,7 @@ class OSInterface : public NonCopyableClass {
MOCKABLE_VIRTUAL bool isDebugAttachAvailable() const;
MOCKABLE_VIRTUAL bool isLockablePointer(bool isLockable) const;
MOCKABLE_VIRTUAL bool isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const;
MOCKABLE_VIRTUAL uint32_t getAggregatedProcessCount() const;
static bool osEnabled64kbPages;

View File

@@ -26,6 +26,10 @@ bool OSInterface::isLockablePointer(bool isLockable) const {
return isLockable;
}
// Default implementation: no staging size threshold is applied on this OS
// backend — any size is accepted regardless of GPU type.
// NOTE(review): both parameters are intentionally unused here; the Linux
// implementation in this commit applies a 512 MB cap for iGPU.
bool OSInterface::isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const {
return true;
}
// Default implementation: an aggregated process count is not available on
// this OS backend, so report zero. The drm-based implementation queries the
// driver model instead.
uint32_t OSInterface::getAggregatedProcessCount() const {
return 0;
}

View File

@@ -10,9 +10,13 @@
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/os_interface.h"
#include "shared/source/utilities/heap_allocator.h"
namespace NEO {
StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) {
@@ -285,11 +289,7 @@ void *StagingBufferManager::allocateStagingBuffer(size_t size) {
return hostPtr;
}
bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const {
auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
}
bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) {
auto usmDstData = svmAllocsManager->getSVMAlloc(dstPtr);
auto usmSrcData = svmAllocsManager->getSVMAlloc(srcPtr);
bool hostToUsmCopy = usmSrcData == nullptr && usmDstData != nullptr;
@@ -297,16 +297,25 @@ bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, co
if (usmDstData) {
isUsedByOsContext = usmDstData->gpuAllocations.getGraphicsAllocation(device.getRootDeviceIndex())->isUsedByOsContext(osContextId);
}
return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize);
return this->isValidForStaging(device, srcPtr, size, hasDependencies) && hostToUsmCopy && (isUsedByOsContext || size <= chunkSize);
}
bool StagingBufferManager::isValidForStagingTransfer(const Device &device, const void *ptr, bool hasDependencies) const {
bool StagingBufferManager::isValidForStagingTransfer(const Device &device, const void *ptr, size_t size, bool hasDependencies) {
auto nonUsmPtr = ptr != nullptr && svmAllocsManager->getSVMAlloc(ptr) == nullptr;
return this->isValidForStaging(device, ptr, size, hasDependencies) && nonUsmPtr;
}
// Common checks for usm, buffers and images
bool StagingBufferManager::isValidForStaging(const Device &device, const void *ptr, size_t size, bool hasDependencies) {
auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
}
auto nonUsmPtr = ptr != nullptr && svmAllocsManager->getSVMAlloc(ptr) == nullptr;
return stagingCopyEnabled && !hasDependencies && nonUsmPtr;
auto isIntegrated = device.getRootDeviceEnvironment().getHardwareInfo()->capabilityTable.isIntegratedDevice;
auto osInterface = device.getRootDeviceEnvironment().osInterface.get();
bool sizeWithinThreshold = osInterface ? osInterface->isSizeWithinThresholdForStaging(size, isIntegrated) : true;
auto detectedHostPtr = this->registerHostPtr(ptr);
return stagingCopyEnabled && !hasDependencies && !detectedHostPtr && sizeWithinThreshold;
}
void StagingBufferManager::clearTrackedChunks() {
@@ -325,4 +334,16 @@ void StagingBufferManager::trackChunk(const StagingBufferTracker &tracker) {
trackers.push_back(tracker);
}
// Registers ptr as a detected imported host pointer.
// Returns true when ptr had already been registered (a repeated use of the
// same host pointer), false on its first registration.
// Guarded by mtx, matching resetDetectedPtrs().
bool StagingBufferManager::registerHostPtr(const void *ptr) {
    std::lock_guard<std::mutex> lock(mtx);
    // std::set::insert returns {iterator, inserted}; .second == false means
    // the key already existed. A single insert replaces the original
    // find-then-insert pair, avoiding a second tree lookup.
    return !detectedHostPtrs.insert(ptr).second;
}
// Forgets every host pointer recorded so far, under the shared mutex.
void StagingBufferManager::resetDetectedPtrs() {
    std::lock_guard<std::mutex> guard(mtx);
    detectedHostPtrs.clear();
}
} // namespace NEO

View File

@@ -16,6 +16,7 @@
#include <memory>
#include <mutex>
#include <queue>
#include <set>
namespace NEO {
class SVMAllocsManager;
@@ -78,8 +79,8 @@ class StagingBufferManager {
StagingBufferManager &operator=(StagingBufferManager &&other) noexcept = delete;
StagingBufferManager &operator=(const StagingBufferManager &other) = delete;
bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const;
bool isValidForStagingTransfer(const Device &device, const void *ptr, bool hasDependencies) const;
bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId);
bool isValidForStagingTransfer(const Device &device, const void *ptr, size_t size, bool hasDependencies);
StagingTransferStatus performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);
StagingTransferStatus performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead);
@@ -88,6 +89,9 @@ class StagingBufferManager {
std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size);
void trackChunk(const StagingBufferTracker &tracker);
bool registerHostPtr(const void *ptr);
void resetDetectedPtrs();
private:
std::pair<HeapAllocator *, uint64_t> getExistingBuffer(size_t &size);
void *allocateStagingBuffer(size_t size);
@@ -99,6 +103,8 @@ class StagingBufferManager {
WaitStatus fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const;
WaitStatus drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const;
bool isValidForStaging(const Device &device, const void *ptr, size_t size, bool hasDependencies);
size_t chunkSize = MemoryConstants::pageSize2M;
std::mutex mtx;
std::vector<StagingBuffer> stagingBuffers;
@@ -108,6 +114,8 @@ class StagingBufferManager {
const RootDeviceIndicesContainer rootDeviceIndices;
const std::map<uint32_t, DeviceBitfield> deviceBitfields;
const bool requiresWritable = false;
std::set<const void *> detectedHostPtrs;
};
} // namespace NEO