mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-30 09:58:55 +08:00
performance: enable staging write for cl buffers
Related-To: NEO-13529. Also add a size threshold on iGPU on Linux, and disable staging if an imported host ptr could be reused. Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
35d8e82664
commit
b11322332c
@@ -58,6 +58,13 @@ bool initDrmOsInterface(std::unique_ptr<HwDeviceId> &&hwDeviceId, uint32_t rootD
|
||||
return true;
|
||||
}
|
||||
|
||||
bool OSInterface::isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const {
|
||||
if (isIGPU) {
|
||||
return size < 512 * MemoryConstants::megaByte;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t OSInterface::getAggregatedProcessCount() const {
|
||||
if (driverModel && driverModel->getDriverModelType() == DriverModelType::drm) {
|
||||
return driverModel->as<Drm>()->getAggregatedProcessCount();
|
||||
|
||||
@@ -117,6 +117,7 @@ class OSInterface : public NonCopyableClass {
|
||||
|
||||
MOCKABLE_VIRTUAL bool isDebugAttachAvailable() const;
|
||||
MOCKABLE_VIRTUAL bool isLockablePointer(bool isLockable) const;
|
||||
MOCKABLE_VIRTUAL bool isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const;
|
||||
MOCKABLE_VIRTUAL uint32_t getAggregatedProcessCount() const;
|
||||
|
||||
static bool osEnabled64kbPages;
|
||||
|
||||
@@ -26,6 +26,10 @@ bool OSInterface::isLockablePointer(bool isLockable) const {
|
||||
return isLockable;
|
||||
}
|
||||
|
||||
// Variant that applies no size limit: every request is reported as within the
// staging threshold, regardless of `size` or `isIGPU`.
bool OSInterface::isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const {
    constexpr bool withinThreshold = true; // neither argument is consulted here
    return withinThreshold;
}
|
||||
|
||||
// Variant that always reports an aggregated process count of zero.
uint32_t OSInterface::getAggregatedProcessCount() const {
    constexpr uint32_t aggregatedCount = 0u;
    return aggregatedCount;
}
|
||||
|
||||
@@ -10,9 +10,13 @@
|
||||
#include "shared/source/command_stream/command_stream_receiver.h"
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/device/device.h"
|
||||
#include "shared/source/execution_environment/root_device_environment.h"
|
||||
#include "shared/source/helpers/aligned_memory.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/memory_manager/unified_memory_manager.h"
|
||||
#include "shared/source/os_interface/os_interface.h"
|
||||
#include "shared/source/utilities/heap_allocator.h"
|
||||
|
||||
namespace NEO {
|
||||
|
||||
StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) {
|
||||
@@ -285,11 +289,7 @@ void *StagingBufferManager::allocateStagingBuffer(size_t size) {
|
||||
return hostPtr;
|
||||
}
|
||||
|
||||
bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const {
|
||||
auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
|
||||
if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
|
||||
stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
|
||||
}
|
||||
bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) {
|
||||
auto usmDstData = svmAllocsManager->getSVMAlloc(dstPtr);
|
||||
auto usmSrcData = svmAllocsManager->getSVMAlloc(srcPtr);
|
||||
bool hostToUsmCopy = usmSrcData == nullptr && usmDstData != nullptr;
|
||||
@@ -297,16 +297,25 @@ bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, co
|
||||
if (usmDstData) {
|
||||
isUsedByOsContext = usmDstData->gpuAllocations.getGraphicsAllocation(device.getRootDeviceIndex())->isUsedByOsContext(osContextId);
|
||||
}
|
||||
return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize);
|
||||
return this->isValidForStaging(device, srcPtr, size, hasDependencies) && hostToUsmCopy && (isUsedByOsContext || size <= chunkSize);
|
||||
}
|
||||
|
||||
// Reports whether `ptr` may be transferred through staging buffers: `ptr` must be
// non-null, must not be known to svmAllocsManager as an SVM allocation, and the
// common isValidForStaging() checks must pass.
// NOTE(review): two signature lines follow. This looks like a diff rendered without
// +/- markers: the first (const, no `size`) is presumably the removed declaration and
// the second its replacement, since the body below uses `size` - confirm upstream.
bool StagingBufferManager::isValidForStagingTransfer(const Device &device, const void *ptr, bool hasDependencies) const {
|
||||
bool StagingBufferManager::isValidForStagingTransfer(const Device &device, const void *ptr, size_t size, bool hasDependencies) {
|
||||
// True only for a non-null pointer that has no SVM allocation entry.
auto nonUsmPtr = ptr != nullptr && svmAllocsManager->getSVMAlloc(ptr) == nullptr;
|
||||
return this->isValidForStaging(device, ptr, size, hasDependencies) && nonUsmPtr;
|
||||
}
|
||||
|
||||
// Common checks for usm, buffers and images
// The final return statement requires all of: staging copies enabled, no
// dependencies, `ptr` not registered by an earlier call, and `size` accepted by the
// OS interface threshold check.
// NOTE(review): this span appears to interleave removed and added diff lines without
// +/- markers; see the notes on the individual statements below.
|
||||
bool StagingBufferManager::isValidForStaging(const Device &device, const void *ptr, size_t size, bool hasDependencies) {
|
||||
// Product default for staging buffers.
auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
|
||||
// Any debug flag value other than -1 replaces the product default.
if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
|
||||
stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
|
||||
}
|
||||
// NOTE(review): this statement and the `return` right after it presumably are the
// removed tail of the pre-change isValidForStagingTransfer; read literally they would
// make everything below unreachable - confirm against the upstream commit.
auto nonUsmPtr = ptr != nullptr && svmAllocsManager->getSVMAlloc(ptr) == nullptr;
|
||||
return stagingCopyEnabled && !hasDependencies && nonUsmPtr;
|
||||
auto isIntegrated = device.getRootDeviceEnvironment().getHardwareInfo()->capabilityTable.isIntegratedDevice;
|
||||
auto osInterface = device.getRootDeviceEnvironment().osInterface.get();
|
||||
// With no OS interface available the size check is treated as passed.
bool sizeWithinThreshold = osInterface ? osInterface->isSizeWithinThresholdForStaging(size, isIntegrated) : true;
|
||||
// registerHostPtr() records `ptr` and reports whether it was already recorded, so a
// host pointer seen on an earlier call fails the check below.
auto detectedHostPtr = this->registerHostPtr(ptr);
|
||||
return stagingCopyEnabled && !hasDependencies && !detectedHostPtr && sizeWithinThreshold;
|
||||
}
|
||||
|
||||
void StagingBufferManager::clearTrackedChunks() {
|
||||
@@ -325,4 +334,16 @@ void StagingBufferManager::trackChunk(const StagingBufferTracker &tracker) {
|
||||
trackers.push_back(tracker);
|
||||
}
|
||||
|
||||
bool StagingBufferManager::registerHostPtr(const void *ptr) {
|
||||
auto lock = std::lock_guard<std::mutex>(mtx);
|
||||
auto isHostPtrDetected = detectedHostPtrs.find(ptr) != detectedHostPtrs.end();
|
||||
detectedHostPtrs.insert(ptr);
|
||||
return isHostPtrDetected;
|
||||
}
|
||||
|
||||
void StagingBufferManager::resetDetectedPtrs() {
|
||||
auto lock = std::lock_guard<std::mutex>(mtx);
|
||||
detectedHostPtrs.clear();
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
|
||||
namespace NEO {
|
||||
class SVMAllocsManager;
|
||||
@@ -78,8 +79,8 @@ class StagingBufferManager {
|
||||
StagingBufferManager &operator=(StagingBufferManager &&other) noexcept = delete;
|
||||
StagingBufferManager &operator=(const StagingBufferManager &other) = delete;
|
||||
|
||||
bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const;
|
||||
bool isValidForStagingTransfer(const Device &device, const void *ptr, bool hasDependencies) const;
|
||||
bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId);
|
||||
bool isValidForStagingTransfer(const Device &device, const void *ptr, size_t size, bool hasDependencies);
|
||||
|
||||
StagingTransferStatus performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);
|
||||
StagingTransferStatus performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead);
|
||||
@@ -88,6 +89,9 @@ class StagingBufferManager {
|
||||
std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size);
|
||||
void trackChunk(const StagingBufferTracker &tracker);
|
||||
|
||||
bool registerHostPtr(const void *ptr);
|
||||
void resetDetectedPtrs();
|
||||
|
||||
private:
|
||||
std::pair<HeapAllocator *, uint64_t> getExistingBuffer(size_t &size);
|
||||
void *allocateStagingBuffer(size_t size);
|
||||
@@ -99,6 +103,8 @@ class StagingBufferManager {
|
||||
WaitStatus fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const;
|
||||
WaitStatus drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const;
|
||||
|
||||
bool isValidForStaging(const Device &device, const void *ptr, size_t size, bool hasDependencies);
|
||||
|
||||
size_t chunkSize = MemoryConstants::pageSize2M;
|
||||
std::mutex mtx;
|
||||
std::vector<StagingBuffer> stagingBuffers;
|
||||
@@ -108,6 +114,8 @@ class StagingBufferManager {
|
||||
const RootDeviceIndicesContainer rootDeviceIndices;
|
||||
const std::map<uint32_t, DeviceBitfield> deviceBitfields;
|
||||
const bool requiresWritable = false;
|
||||
|
||||
std::set<const void *> detectedHostPtrs;
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
Reference in New Issue
Block a user