mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-27 07:44:16 +08:00
performance: use BCS for transfers if CCS is busy
Related-To: NEO-11501 Also, if device is iGPU, don't use staging buffers in that case. Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
17380dcbf3
commit
39ec7facee
@@ -302,8 +302,12 @@ CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelec
|
||||
case TransferDirection::hostToHost:
|
||||
case TransferDirection::hostToLocal:
|
||||
case TransferDirection::localToHost: {
|
||||
preferBcs = true;
|
||||
|
||||
auto isWriteToImageFromBuffer = args.dstResource.image && args.dstResource.image->isImageFromBuffer();
|
||||
auto &productHelper = device->getProductHelper();
|
||||
preferBcs = device->getRootDeviceEnvironment().isWddmOnLinux() || productHelper.blitEnqueuePreferred(isWriteToImageFromBuffer);
|
||||
if (debugManager.flags.EnableBlitterForEnqueueOperations.get() == 1) {
|
||||
preferBcs = true;
|
||||
}
|
||||
auto preferredBCSType = true;
|
||||
|
||||
if (debugManager.flags.AssignBCSAtEnqueue.get() != -1) {
|
||||
@@ -322,6 +326,11 @@ CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelec
|
||||
device->getSelectorCopyEngine(), false);
|
||||
}
|
||||
}
|
||||
|
||||
if (!preferBcs && isOOQEnabled() && getGpgpuCommandStreamReceiver().isBusy()) {
|
||||
// If CCS is preferred but it's OOQ and compute engine is busy, select BCS instead
|
||||
preferBcs = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -1093,8 +1102,7 @@ bool CommandQueue::queueDependenciesClearRequired() const {
|
||||
}
|
||||
|
||||
bool CommandQueue::blitEnqueueAllowed(const CsrSelectionArgs &args) const {
|
||||
auto isWriteToImageFromBuffer = args.dstResource.image && args.dstResource.image->isImageFromBuffer();
|
||||
bool blitEnqueueAllowed = ((device->getRootDeviceEnvironment().isWddmOnLinux() || device->getRootDeviceEnvironment().getProductHelper().blitEnqueueAllowed(isWriteToImageFromBuffer)) && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) || this->isCopyOnly;
|
||||
bool blitEnqueueAllowed = getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() || this->isCopyOnly;
|
||||
if (debugManager.flags.EnableBlitterForEnqueueOperations.get() != -1) {
|
||||
blitEnqueueAllowed = debugManager.flags.EnableBlitterForEnqueueOperations.get();
|
||||
}
|
||||
@@ -1611,6 +1619,13 @@ bool CommandQueue::isValidForStagingBufferCopy(Device &device, void *dstPtr, con
|
||||
// Direct transfer from mapped allocation is faster than staging buffer
|
||||
return false;
|
||||
}
|
||||
auto rootDeviceIndex = device.getRootDeviceIndex();
|
||||
auto isLocalMem = device.getMemoryManager()->isLocalMemorySupported(rootDeviceIndex);
|
||||
if (isOOQEnabled() && getGpgpuCommandStreamReceiver().isBusy() && !isLocalMem) {
|
||||
// It's not beneficial to make copy through staging buffers if it's OOQ,
|
||||
// compute engine is busy and device is iGPU.
|
||||
return false;
|
||||
}
|
||||
CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, nullptr};
|
||||
csrSelectionArgs.direction = TransferDirection::hostToLocal;
|
||||
auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs);
|
||||
|
||||
Reference in New Issue
Block a user