performance: Bidirectional BCS split implementation for OCL

Related-To: NEO-7877

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2023-05-12 10:36:06 +00:00
committed by Compute-Runtime-Automation
parent 7f24a4ba25
commit 9f755b54ad
19 changed files with 287 additions and 12 deletions

View File

@@ -7,6 +7,7 @@
#pragma once
#include "shared/source/built_ins/built_in_ops_base.h"
#include "shared/source/command_stream/transfer_direction.h"
#include "shared/source/helpers/vec.h"
#include "opencl/source/kernel/multi_device_kernel.h"
@@ -50,6 +51,7 @@ struct BuiltinOpParams {
uint32_t dstMipLevel = 0;
void *userPtrForPostOperationCpuCopy = nullptr;
bool bcsSplit = false;
TransferDirection direction = TransferDirection::LocalToLocal;
};
class BuiltinDispatchInfoBuilder {

View File

@@ -371,6 +371,13 @@ void CommandQueue::constructBcsEnginesForSplit() {
}
}
if (DebugManager.flags.SplitBcsMaskD2H.get() > 0) {
this->d2hEngines = DebugManager.flags.SplitBcsMaskD2H.get();
}
if (DebugManager.flags.SplitBcsMaskH2D.get() > 0) {
this->h2dEngines = DebugManager.flags.SplitBcsMaskH2D.get();
}
this->bcsSplitInitialized = true;
}

View File

@@ -437,6 +437,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool bcsSplitInitialized = false;
BcsInfoMask splitEngines = EngineHelpers::oddLinkedCopyEnginesMask;
BcsInfoMask h2dEngines = NEO::EngineHelpers::h2dCopyEngineMask;
BcsInfoMask d2hEngines = NEO::EngineHelpers::d2hCopyEngineMask;
LinearStream *commandStream = nullptr;

View File

@@ -1245,11 +1245,18 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchIn
auto ret = CL_SUCCESS;
this->releaseMainCopyEngine();
StackVec<std::unique_lock<CommandStreamReceiver::MutexType>, 4u> locks;
StackVec<CommandStreamReceiver *, 4u> copyEngines;
StackVec<std::unique_lock<CommandStreamReceiver::MutexType>, 2u> locks;
StackVec<CommandStreamReceiver *, 2u> copyEngines;
auto splitEngines = this->splitEngines;
if (dispatchInfo.peekBuiltinOpParams().direction == NEO::TransferDirection::HostToLocal) {
splitEngines = this->h2dEngines;
} else if (dispatchInfo.peekBuiltinOpParams().direction == NEO::TransferDirection::LocalToHost) {
splitEngines = this->d2hEngines;
}
for (uint32_t i = 0; i < bcsInfoMaskSize; i++) {
if (this->splitEngines.test(i)) {
if (splitEngines.test(i)) {
auto engineType = EngineHelpers::mapBcsIndexToEngineType(i, true);
auto bcs = getBcsCommandStreamReceiver(engineType);
if (bcs) {

View File

@@ -40,6 +40,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueCopyBuffer(
dc.dstOffset = {dstOffset, 0, 0};
dc.size = {size, 0, 0};
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, size, csr);
dc.direction = csrSelectionArgs.direction;
MultiDispatchInfo dispatchInfo(dc);

View File

@@ -53,6 +53,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueCopyBufferRect(
dc.dstRowPitch = dstRowPitch;
dc.dstSlicePitch = dstSlicePitch;
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, getTotalSizeFromRectRegion(region), csr);
dc.direction = csrSelectionArgs.direction;
MultiDispatchInfo dispatchInfo(dc);
return dispatchBcsOrGpgpuEnqueue<CL_COMMAND_COPY_BUFFER_RECT>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, false, csr);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -53,6 +53,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueCopyImage(
dc.dstMipLevel = findMipLevel(dstImage->getImageDesc().image_type, dstOrigin);
}
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, getTotalSizeFromRectRegion(region), csr);
dc.direction = csrSelectionArgs.direction;
MultiDispatchInfo dispatchInfo(dc);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -110,6 +110,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBuffer(
dc.size = {size, 0, 0};
dc.transferAllocation = mapAllocation ? mapAllocation : hostPtrSurf.getAllocation();
dc.bcsSplit = bcsSplit;
dc.direction = csrSelectionArgs.direction;
MultiDispatchInfo dispatchInfo(dc);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -100,6 +100,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferRect(
dc.dstRowPitch = hostRowPitch;
dc.dstSlicePitch = hostSlicePitch;
dc.bcsSplit = bcsSplit;
dc.direction = csrSelectionArgs.direction;
MultiDispatchInfo dispatchInfo(dc);
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER_RECT>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingRead, csr);

View File

@@ -114,6 +114,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadImage(
dc.userPtrForPostOperationCpuCopy = ptr;
}
dc.bcsSplit = bcsSplit;
dc.direction = csrSelectionArgs.direction;
auto eBuiltInOps = EBuiltInOps::CopyImage3dToBuffer;
MultiDispatchInfo dispatchInfo(dc);

View File

@@ -125,6 +125,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMap(cl_bool blockingMap,
dc.size = {size, 0, 0};
dc.unifiedMemoryArgsRequireMemSync = externalAppCall;
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, size, csr);
dc.direction = csrSelectionArgs.direction;
MultiDispatchInfo dispatchInfo(dc);
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, EBuiltInOps::CopyBufferToBuffer, numEventsInWaitList, eventWaitList, event, blocking, csr);
@@ -212,6 +213,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMUnmap(void *svmPtr,
dc.size = {svmOperation->regionSize, 0, 0};
dc.unifiedMemoryArgsRequireMemSync = externalAppCall;
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, svmOperation->regionSize, csr);
dc.direction = csrSelectionArgs.direction;
MultiDispatchInfo dispatchInfo(dc);
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, EBuiltInOps::CopyBufferToBuffer, numEventsInWaitList, eventWaitList, event, false, csr);
@@ -385,6 +387,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
surfaces[1] = &dstHostPtrSurf;
operationParams.bcsSplit = bcsSplit;
operationParams.direction = csrSelectionArgs.direction;
dispatchInfo.setBuiltinOpParams(operationParams);
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
} else if (copyType == HostToSvm) {
@@ -409,6 +412,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
surfaces[1] = &srcHostPtrSurf;
operationParams.bcsSplit = bcsSplit;
operationParams.direction = csrSelectionArgs.direction;
dispatchInfo.setBuiltinOpParams(operationParams);
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
} else if (copyType == SvmToSvm) {
@@ -422,6 +426,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
surfaces[1] = &dstSvmSurf;
operationParams.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, size, csr);
operationParams.direction = csrSelectionArgs.direction;
dispatchInfo.setBuiltinOpParams(operationParams);
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_SVM_MEMCPY>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
} else {
@@ -449,6 +454,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
surfaces[1] = &dstHostPtrSurf;
operationParams.bcsSplit = bcsSplit;
operationParams.direction = csrSelectionArgs.direction;
dispatchInfo.setBuiltinOpParams(operationParams);
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -99,6 +99,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBuffer(
dc.size = {size, 0, 0};
dc.transferAllocation = mapAllocation ? mapAllocation : hostPtrSurf.getAllocation();
dc.bcsSplit = bcsSplit;
dc.direction = csrSelectionArgs.direction;
MultiDispatchInfo dispatchInfo(dc);
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingWrite, csr);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -105,6 +105,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBufferRect(
dc.dstRowPitch = bufferRowPitch;
dc.dstSlicePitch = bufferSlicePitch;
dc.bcsSplit = bcsSplit;
dc.direction = csrSelectionArgs.direction;
MultiDispatchInfo dispatchInfo(dc);
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER_RECT>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingWrite, csr);

View File

@@ -98,6 +98,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteImage(
}
dc.transferAllocation = mapAllocation ? mapAllocation : hostPtrSurf.getAllocation();
dc.bcsSplit = bcsSplit;
dc.direction = csrSelectionArgs.direction;
auto eBuiltInOps = EBuiltInOps::CopyBufferToImage3d;
MultiDispatchInfo dispatchInfo(dc);