Add initial enqueue bcs split infrastructure
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
parent
61242205e0
commit
b10b3ed9dd
|
@ -48,6 +48,7 @@ struct BuiltinOpParams {
|
|||
uint32_t srcMipLevel = 0;
|
||||
uint32_t dstMipLevel = 0;
|
||||
void *userPtrForPostOperationCpuCopy = nullptr;
|
||||
bool bcsSplit = false;
|
||||
};
|
||||
|
||||
class BuiltinDispatchInfoBuilder {
|
||||
|
|
|
@ -113,10 +113,7 @@ CommandQueue::~CommandQueue() {
|
|||
device->getPerformanceCounters()->shutdown();
|
||||
}
|
||||
|
||||
if (auto mainBcs = bcsEngines[0]; mainBcs != nullptr) {
|
||||
auto &selectorCopyEngine = device->getNearestGenericSubDevice(0)->getSelectorCopyEngine();
|
||||
EngineHelpers::releaseBcsEngineType(mainBcs->getEngineType(), selectorCopyEngine);
|
||||
}
|
||||
this->releaseMainCopyEngine();
|
||||
}
|
||||
|
||||
timestampPacketContainer.reset();
|
||||
|
@ -307,6 +304,47 @@ void CommandQueue::initializeBcsEngine(bool internalUsage) {
|
|||
constructBcsEngine(internalUsage);
|
||||
}
|
||||
|
||||
void CommandQueue::constructBcsEnginesForSplit() {
|
||||
if (!this->bcsSplitInitialized) {
|
||||
for (auto i = static_cast<uint32_t>(aub_stream::EngineType::ENGINE_BCS2); i <= static_cast<uint32_t>(aub_stream::EngineType::ENGINE_BCS8); i += 2) {
|
||||
auto index = EngineHelpers::getBcsIndex(static_cast<aub_stream::EngineType>(i));
|
||||
if (!bcsEngines[index]) {
|
||||
auto &neoDevice = device->getNearestGenericSubDevice(0)->getDevice();
|
||||
bcsEngines[index] = neoDevice.tryGetEngine(static_cast<aub_stream::EngineType>(i), EngineUsage::Regular);
|
||||
bcsEngineTypes.push_back(static_cast<aub_stream::EngineType>(i));
|
||||
if (bcsEngines[index]) {
|
||||
bcsEngines[index]->osContext->ensureContextInitialized();
|
||||
bcsEngines[index]->commandStreamReceiver->initDirectSubmission();
|
||||
}
|
||||
}
|
||||
}
|
||||
this->bcsSplitInitialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
void CommandQueue::prepareHostPtrSurfaceForSplit(bool split, GraphicsAllocation &allocation) {
|
||||
if (split) {
|
||||
for (const auto bcsEngine : this->bcsEngines) {
|
||||
if (bcsEngine) {
|
||||
if (allocation.getTaskCount(bcsEngine->commandStreamReceiver->getOsContext().getContextId()) == GraphicsAllocation::objectNotUsed) {
|
||||
allocation.updateTaskCount(0u, bcsEngine->commandStreamReceiver->getOsContext().getContextId());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Chooses the command stream receiver that should create a host-pointer allocation:
// the queue's GPGPU receiver when the transfer is split, otherwise the caller-supplied `csr`.
CommandStreamReceiver &CommandQueue::selectCsrForHostPtrAllocation(bool split, CommandStreamReceiver &csr) {
    if (split) {
        return getGpgpuCommandStreamReceiver();
    }
    return csr;
}
|
||||
|
||||
void CommandQueue::releaseMainCopyEngine() {
|
||||
if (auto mainBcs = bcsEngines[0]; mainBcs != nullptr) {
|
||||
auto &selectorCopyEngine = device->getNearestGenericSubDevice(0)->getSelectorCopyEngine();
|
||||
EngineHelpers::releaseBcsEngineType(mainBcs->getEngineType(), selectorCopyEngine);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the Device obtained from the ClDevice this queue was created on.
Device &CommandQueue::getDevice() const noexcept {
    Device &underlyingDevice = device->getDevice();
    return underlyingDevice;
}
|
||||
|
|
|
@ -229,6 +229,10 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
|||
MOCKABLE_VIRTUAL CommandStreamReceiver &selectCsrForBuiltinOperation(const CsrSelectionArgs &args);
|
||||
void constructBcsEngine(bool internalUsage);
|
||||
MOCKABLE_VIRTUAL void initializeBcsEngine(bool internalUsage);
|
||||
void constructBcsEnginesForSplit();
|
||||
void prepareHostPtrSurfaceForSplit(bool split, GraphicsAllocation &allocation);
|
||||
CommandStreamReceiver &selectCsrForHostPtrAllocation(bool split, CommandStreamReceiver &csr);
|
||||
void releaseMainCopyEngine();
|
||||
Device &getDevice() const noexcept;
|
||||
ClDevice &getClDevice() const { return *device; }
|
||||
Context &getContext() const { return *context; }
|
||||
|
@ -416,6 +420,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
|||
bool isCopyOnly = false;
|
||||
bool bcsAllowed = false;
|
||||
bool bcsInitialized = false;
|
||||
bool bcsSplitInitialized = false;
|
||||
|
||||
LinearStream *commandStream = nullptr;
|
||||
|
||||
|
|
|
@ -357,6 +357,12 @@ class CommandQueueHw : public CommandQueue {
|
|||
template <uint32_t cmdType>
|
||||
cl_int enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr);
|
||||
|
||||
bool isSplitEnqueueBlitSupported();
|
||||
bool isSplitEnqueueBlitNeeded(TransferDirection transferDirection, CommandStreamReceiver &csr);
|
||||
|
||||
template <uint32_t cmdType>
|
||||
cl_int enqueueBlitSplit(MultiDispatchInfo &dispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr);
|
||||
|
||||
template <uint32_t commandType>
|
||||
CompletionStamp enqueueNonBlocked(Surface **surfacesForResidency,
|
||||
size_t surfaceCount,
|
||||
|
|
|
@ -1103,6 +1103,105 @@ size_t CommandQueueHw<GfxFamily>::calculateHostPtrSizeForImage(const size_t *reg
|
|||
return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
bool CommandQueueHw<GfxFamily>::isSplitEnqueueBlitSupported() {
|
||||
auto bcsSplit = HwInfoConfig::get(getDevice().getHardwareInfo().platform.eProductFamily)->isBlitSplitEnqueueWARequired(getDevice().getHardwareInfo());
|
||||
|
||||
if (DebugManager.flags.SplitBcsCopy.get() != -1) {
|
||||
bcsSplit = DebugManager.flags.SplitBcsCopy.get();
|
||||
}
|
||||
|
||||
return bcsSplit;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
bool CommandQueueHw<GfxFamily>::isSplitEnqueueBlitNeeded(TransferDirection transferDirection, CommandStreamReceiver &csr) {
|
||||
auto bcsSplit = isSplitEnqueueBlitSupported() &&
|
||||
csr.getOsContext().getEngineType() == aub_stream::EngineType::ENGINE_BCS &&
|
||||
(transferDirection == TransferDirection::HostToLocal ||
|
||||
transferDirection == TransferDirection::LocalToHost);
|
||||
|
||||
if (bcsSplit) {
|
||||
this->constructBcsEnginesForSplit();
|
||||
}
|
||||
|
||||
return bcsSplit;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <uint32_t cmdType>
|
||||
cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr) {
|
||||
auto ret = CL_SUCCESS;
|
||||
this->releaseMainCopyEngine();
|
||||
|
||||
StackVec<std::unique_lock<CommandStreamReceiver::MutexType>, 3u> locks;
|
||||
StackVec<CommandStreamReceiver *, 3u> copyEngines;
|
||||
for (auto i = static_cast<uint32_t>(aub_stream::EngineType::ENGINE_BCS2); i <= static_cast<uint32_t>(aub_stream::EngineType::ENGINE_BCS8); i += 2) {
|
||||
auto bcs = getBcsCommandStreamReceiver(static_cast<aub_stream::EngineType>(i));
|
||||
if (bcs) {
|
||||
locks.push_back(std::move(bcs->obtainUniqueOwnership()));
|
||||
copyEngines.push_back(bcs);
|
||||
}
|
||||
}
|
||||
DEBUG_BREAK_IF(copyEngines.size() == 0);
|
||||
TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
|
||||
|
||||
TimestampPacketContainer splitNodes;
|
||||
TimestampPacketContainer previousEnqueueNode;
|
||||
previousEnqueueNode.swapNodes(*this->timestampPacketContainer);
|
||||
|
||||
auto srcOffset = dispatchInfo.peekBuiltinOpParams().srcOffset;
|
||||
auto dstOffset = dispatchInfo.peekBuiltinOpParams().dstOffset;
|
||||
auto size = dispatchInfo.peekBuiltinOpParams().size;
|
||||
auto remainingSize = size;
|
||||
|
||||
for (size_t i = 0; i < copyEngines.size(); i++) {
|
||||
auto localSizeX = remainingSize.x / (copyEngines.size() - i);
|
||||
auto localSizeY = remainingSize.y / (copyEngines.size() - i);
|
||||
auto localSizeZ = remainingSize.z / (copyEngines.size() - i);
|
||||
|
||||
auto localParams = dispatchInfo.peekBuiltinOpParams();
|
||||
|
||||
localParams.size.x = localSizeX;
|
||||
localParams.size.y = localSizeY;
|
||||
localParams.size.z = localSizeZ;
|
||||
|
||||
localParams.srcOffset.x = (srcOffset.x + size.x - remainingSize.x);
|
||||
localParams.srcOffset.y = (srcOffset.y + size.y - remainingSize.y);
|
||||
localParams.srcOffset.z = (srcOffset.z + size.z - remainingSize.z);
|
||||
|
||||
localParams.dstOffset.x = (dstOffset.x + size.x - remainingSize.x);
|
||||
localParams.dstOffset.y = (dstOffset.y + size.y - remainingSize.y);
|
||||
localParams.dstOffset.z = (dstOffset.z + size.z - remainingSize.z);
|
||||
|
||||
dispatchInfo.setBuiltinOpParams(localParams);
|
||||
|
||||
remainingSize.x -= localSizeX;
|
||||
remainingSize.y -= localSizeY;
|
||||
remainingSize.z -= localSizeZ;
|
||||
|
||||
this->timestampPacketContainer->assignAndIncrementNodesRefCounts(previousEnqueueNode);
|
||||
|
||||
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, remainingSize == 0 ? event : nullptr, false, *copyEngines[i]);
|
||||
DEBUG_BREAK_IF(ret != CL_SUCCESS);
|
||||
|
||||
this->timestampPacketContainer->moveNodesToNewContainer(splitNodes);
|
||||
}
|
||||
|
||||
if (event) {
|
||||
auto e = castToObjectOrAbort<Event>(*event);
|
||||
e->addTimestampPacketNodes(splitNodes);
|
||||
}
|
||||
|
||||
this->timestampPacketContainer->swapNodes(splitNodes);
|
||||
|
||||
if (blocking) {
|
||||
ret = this->finish();
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <uint32_t cmdType>
|
||||
cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr) {
|
||||
|
@ -1236,7 +1335,15 @@ cl_int CommandQueueHw<GfxFamily>::dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &d
|
|||
const bool blit = EngineHelpers::isBcs(csr.getOsContext().getEngineType());
|
||||
|
||||
if (blit) {
|
||||
return enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
|
||||
cl_int ret = CL_SUCCESS;
|
||||
|
||||
if (dispatchInfo.peekBuiltinOpParams().bcsSplit) {
|
||||
ret = enqueueBlitSplit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
|
||||
} else {
|
||||
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
|
||||
}
|
||||
|
||||
return ret;
|
||||
} else {
|
||||
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInOperation,
|
||||
this->getClDevice());
|
||||
|
|
|
@ -45,6 +45,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueCopyBuffer(
|
|||
dc.srcOffset = {srcOffset, 0, 0};
|
||||
dc.dstOffset = {dstOffset, 0, 0};
|
||||
dc.size = {size, 0, 0};
|
||||
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
|
||||
|
|
|
@ -56,6 +56,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueCopyBufferRect(
|
|||
dc.srcSlicePitch = srcSlicePitch;
|
||||
dc.dstRowPitch = dstRowPitch;
|
||||
dc.dstSlicePitch = dstSlicePitch;
|
||||
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
return dispatchBcsOrGpgpuEnqueue<CL_COMMAND_COPY_BUFFER_RECT>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, false, csr);
|
||||
|
|
|
@ -52,6 +52,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueCopyImage(
|
|||
if (isMipMapped(dstImage->getImageDesc())) {
|
||||
dc.dstMipLevel = findMipLevel(dstImage->getImageDesc().image_type, dstOrigin);
|
||||
}
|
||||
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
|
||||
|
|
|
@ -82,6 +82,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBuffer(
|
|||
GeneralSurface mapSurface;
|
||||
Surface *surfaces[] = {&bufferSurf, nullptr};
|
||||
|
||||
auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
if (mapAllocation) {
|
||||
surfaces[1] = &mapSurface;
|
||||
mapSurface.setGraphicsAllocation(mapAllocation);
|
||||
|
@ -89,10 +91,11 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBuffer(
|
|||
} else {
|
||||
surfaces[1] = &hostPtrSurf;
|
||||
if (size != 0) {
|
||||
bool status = csr.createAllocationForHostSurface(hostPtrSurf, true);
|
||||
bool status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(hostPtrSurf, true);
|
||||
if (!status) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *hostPtrSurf.getAllocation());
|
||||
dstPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
|
||||
}
|
||||
}
|
||||
|
@ -106,6 +109,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBuffer(
|
|||
dc.srcOffset = {offset, 0, 0};
|
||||
dc.size = {size, 0, 0};
|
||||
dc.transferAllocation = mapAllocation ? mapAllocation : hostPtrSurf.getAllocation();
|
||||
dc.bcsSplit = bcsSplit;
|
||||
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
|
||||
|
|
|
@ -66,6 +66,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferRect(
|
|||
GeneralSurface mapSurface;
|
||||
Surface *surfaces[] = {&srcBufferSurf, nullptr};
|
||||
|
||||
auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
if (region[0] != 0 && region[1] != 0 && region[2] != 0) {
|
||||
if (mapAllocation) {
|
||||
surfaces[1] = &mapSurface;
|
||||
|
@ -73,10 +75,11 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferRect(
|
|||
dstPtr = convertAddressWithOffsetToGpuVa(dstPtr, memoryType, *mapAllocation);
|
||||
} else {
|
||||
surfaces[1] = &hostPtrSurf;
|
||||
bool status = csr.createAllocationForHostSurface(hostPtrSurf, true);
|
||||
bool status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(hostPtrSurf, true);
|
||||
if (!status) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *hostPtrSurf.getAllocation());
|
||||
dstPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
|
||||
}
|
||||
}
|
||||
|
@ -96,6 +99,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferRect(
|
|||
dc.srcSlicePitch = bufferSlicePitch;
|
||||
dc.dstRowPitch = hostRowPitch;
|
||||
dc.dstSlicePitch = hostSlicePitch;
|
||||
dc.bcsSplit = bcsSplit;
|
||||
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER_RECT>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingRead, csr);
|
||||
|
|
|
@ -63,6 +63,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadImage(
|
|||
GeneralSurface mapSurface;
|
||||
Surface *surfaces[] = {&srcImgSurf, nullptr};
|
||||
|
||||
auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
bool tempAllocFallback = false;
|
||||
if (mapAllocation) {
|
||||
surfaces[1] = &mapSurface;
|
||||
|
@ -75,11 +77,11 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadImage(
|
|||
if (region[0] != 0 &&
|
||||
region[1] != 0 &&
|
||||
region[2] != 0) {
|
||||
bool status = csr.createAllocationForHostSurface(hostPtrSurf, true);
|
||||
bool status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(hostPtrSurf, true);
|
||||
if (!status) {
|
||||
if (CL_TRUE == blockingRead) {
|
||||
hostPtrSurf.setIsPtrCopyAllowed(true);
|
||||
status = csr.createAllocationForHostSurface(hostPtrSurf, true);
|
||||
status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(hostPtrSurf, true);
|
||||
if (!status) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
@ -89,6 +91,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadImage(
|
|||
}
|
||||
}
|
||||
dstPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *hostPtrSurf.getAllocation());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -110,6 +113,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadImage(
|
|||
if (tempAllocFallback) {
|
||||
dc.userPtrForPostOperationCpuCopy = ptr;
|
||||
}
|
||||
dc.bcsSplit = bcsSplit;
|
||||
|
||||
auto eBuiltInOps = EBuiltInOps::CopyImage3dToBuffer;
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
|
|
|
@ -123,6 +123,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMap(cl_bool blockingMap,
|
|||
dc.srcOffset = {svmOffset, 0, 0};
|
||||
dc.size = {size, 0, 0};
|
||||
dc.unifiedMemoryArgsRequireMemSync = externalAppCall;
|
||||
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, EBuiltInOps::CopyBufferToBuffer, numEventsInWaitList, eventWaitList, event, blocking, csr);
|
||||
|
@ -209,6 +210,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMUnmap(void *svmPtr,
|
|||
dc.srcOffset = {svmOperation->offset, 0, 0};
|
||||
dc.size = {svmOperation->regionSize, 0, 0};
|
||||
dc.unifiedMemoryArgsRequireMemSync = externalAppCall;
|
||||
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, EBuiltInOps::CopyBufferToBuffer, numEventsInWaitList, eventWaitList, event, false, csr);
|
||||
|
@ -364,18 +366,24 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
|
|||
|
||||
GeneralSurface srcSvmSurf(srcAllocation);
|
||||
HostPtrSurface dstHostPtrSurf(dstGpuPtr, size);
|
||||
|
||||
auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
if (size != 0) {
|
||||
bool status = csr.createAllocationForHostSurface(dstHostPtrSurf, true);
|
||||
bool status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(dstHostPtrSurf, true);
|
||||
if (!status) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
dstGpuPtr = reinterpret_cast<void *>(dstHostPtrSurf.getAllocation()->getGpuAddress());
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *dstHostPtrSurf.getAllocation());
|
||||
|
||||
notifyEnqueueSVMMemcpy(srcAllocation, !!blockingCopy, EngineHelpers::isBcs(csr.getOsContext().getEngineType()));
|
||||
}
|
||||
setOperationParams(operationParams, size, srcGpuPtr, srcAllocation, dstGpuPtr, dstHostPtrSurf.getAllocation());
|
||||
surfaces[0] = &srcSvmSurf;
|
||||
surfaces[1] = &dstHostPtrSurf;
|
||||
|
||||
operationParams.bcsSplit = bcsSplit;
|
||||
dispatchInfo.setBuiltinOpParams(operationParams);
|
||||
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
|
||||
} else if (copyType == HostToSvm) {
|
||||
|
@ -384,17 +392,22 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
|
|||
|
||||
HostPtrSurface srcHostPtrSurf(const_cast<void *>(srcGpuPtr), size, true);
|
||||
GeneralSurface dstSvmSurf(dstAllocation);
|
||||
|
||||
auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
if (size != 0) {
|
||||
bool status = csr.createAllocationForHostSurface(srcHostPtrSurf, false);
|
||||
bool status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(srcHostPtrSurf, false);
|
||||
if (!status) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
srcGpuPtr = reinterpret_cast<void *>(srcHostPtrSurf.getAllocation()->getGpuAddress());
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *srcHostPtrSurf.getAllocation());
|
||||
}
|
||||
setOperationParams(operationParams, size, srcGpuPtr, srcHostPtrSurf.getAllocation(), dstGpuPtr, dstAllocation);
|
||||
surfaces[0] = &dstSvmSurf;
|
||||
surfaces[1] = &srcHostPtrSurf;
|
||||
|
||||
operationParams.bcsSplit = bcsSplit;
|
||||
dispatchInfo.setBuiltinOpParams(operationParams);
|
||||
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
|
||||
} else if (copyType == SvmToSvm) {
|
||||
|
@ -407,6 +420,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
|
|||
surfaces[0] = &srcSvmSurf;
|
||||
surfaces[1] = &dstSvmSurf;
|
||||
|
||||
operationParams.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
dispatchInfo.setBuiltinOpParams(operationParams);
|
||||
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_SVM_MEMCPY>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
|
||||
} else {
|
||||
|
@ -415,19 +429,25 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
|
|||
|
||||
HostPtrSurface srcHostPtrSurf(const_cast<void *>(srcGpuPtr), size);
|
||||
HostPtrSurface dstHostPtrSurf(dstGpuPtr, size);
|
||||
|
||||
auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
if (size != 0) {
|
||||
bool status = csr.createAllocationForHostSurface(srcHostPtrSurf, false);
|
||||
status &= csr.createAllocationForHostSurface(dstHostPtrSurf, true);
|
||||
bool status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(srcHostPtrSurf, false);
|
||||
status &= selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(dstHostPtrSurf, true);
|
||||
if (!status) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
srcGpuPtr = reinterpret_cast<void *>(srcHostPtrSurf.getAllocation()->getGpuAddress());
|
||||
dstGpuPtr = reinterpret_cast<void *>(dstHostPtrSurf.getAllocation()->getGpuAddress());
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *srcHostPtrSurf.getAllocation());
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *dstHostPtrSurf.getAllocation());
|
||||
}
|
||||
setOperationParams(operationParams, size, srcGpuPtr, srcHostPtrSurf.getAllocation(), dstGpuPtr, dstHostPtrSurf.getAllocation());
|
||||
surfaces[0] = &srcHostPtrSurf;
|
||||
surfaces[1] = &dstHostPtrSurf;
|
||||
|
||||
operationParams.bcsSplit = bcsSplit;
|
||||
dispatchInfo.setBuiltinOpParams(operationParams);
|
||||
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
|
||||
}
|
||||
|
|
|
@ -70,6 +70,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBuffer(
|
|||
GeneralSurface mapSurface;
|
||||
Surface *surfaces[] = {&bufferSurf, nullptr};
|
||||
|
||||
auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
if (mapAllocation) {
|
||||
surfaces[1] = &mapSurface;
|
||||
mapSurface.setGraphicsAllocation(mapAllocation);
|
||||
|
@ -77,10 +79,12 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBuffer(
|
|||
} else {
|
||||
surfaces[1] = &hostPtrSurf;
|
||||
if (size != 0) {
|
||||
bool status = csr.createAllocationForHostSurface(hostPtrSurf, false);
|
||||
bool status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(hostPtrSurf, false);
|
||||
if (!status) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *hostPtrSurf.getAllocation());
|
||||
|
||||
srcPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
|
||||
}
|
||||
}
|
||||
|
@ -94,6 +98,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBuffer(
|
|||
dc.dstOffset = {offset, 0, 0};
|
||||
dc.size = {size, 0, 0};
|
||||
dc.transferAllocation = mapAllocation ? mapAllocation : hostPtrSurf.getAllocation();
|
||||
dc.bcsSplit = bcsSplit;
|
||||
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingWrite, csr);
|
||||
|
|
|
@ -70,6 +70,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBufferRect(
|
|||
GeneralSurface mapSurface;
|
||||
Surface *surfaces[] = {&dstBufferSurf, nullptr};
|
||||
|
||||
auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
if (region[0] != 0 && region[1] != 0 && region[2] != 0) {
|
||||
if (mapAllocation) {
|
||||
surfaces[1] = &mapSurface;
|
||||
|
@ -77,10 +79,12 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBufferRect(
|
|||
srcPtr = convertAddressWithOffsetToGpuVa(srcPtr, memoryType, *mapAllocation);
|
||||
} else {
|
||||
surfaces[1] = &hostPtrSurf;
|
||||
bool status = csr.createAllocationForHostSurface(hostPtrSurf, false);
|
||||
bool status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(hostPtrSurf, false);
|
||||
if (!status) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *hostPtrSurf.getAllocation());
|
||||
|
||||
srcPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
|
||||
}
|
||||
}
|
||||
|
@ -100,6 +104,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBufferRect(
|
|||
dc.srcSlicePitch = hostSlicePitch;
|
||||
dc.dstRowPitch = bufferRowPitch;
|
||||
dc.dstSlicePitch = bufferSlicePitch;
|
||||
dc.bcsSplit = bcsSplit;
|
||||
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER_RECT>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingWrite, csr);
|
||||
|
|
|
@ -58,6 +58,9 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteImage(
|
|||
HostPtrSurface hostPtrSurf(srcPtr, hostPtrSize, true);
|
||||
GeneralSurface mapSurface;
|
||||
Surface *surfaces[] = {&dstImgSurf, nullptr};
|
||||
|
||||
auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, csr);
|
||||
|
||||
if (mapAllocation) {
|
||||
surfaces[1] = &mapSurface;
|
||||
mapSurface.setGraphicsAllocation(mapAllocation);
|
||||
|
@ -69,10 +72,12 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteImage(
|
|||
if (region[0] != 0 &&
|
||||
region[1] != 0 &&
|
||||
region[2] != 0) {
|
||||
bool status = csr.createAllocationForHostSurface(hostPtrSurf, false);
|
||||
bool status = selectCsrForHostPtrAllocation(bcsSplit, csr).createAllocationForHostSurface(hostPtrSurf, false);
|
||||
if (!status) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
this->prepareHostPtrSurfaceForSplit(bcsSplit, *hostPtrSurf.getAllocation());
|
||||
|
||||
srcPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
|
||||
}
|
||||
}
|
||||
|
@ -92,6 +97,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteImage(
|
|||
dc.dstMipLevel = findMipLevel(dstImage->getImageDesc().image_type, origin);
|
||||
}
|
||||
dc.transferAllocation = mapAllocation ? mapAllocation : hostPtrSurf.getAllocation();
|
||||
dc.bcsSplit = bcsSplit;
|
||||
|
||||
auto eBuiltInOps = EBuiltInOps::CopyBufferToImage3d;
|
||||
MultiDispatchInfo dispatchInfo(dc);
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
#include "shared/test/common/cmd_parse/hw_parse.h"
|
||||
#include "shared/test/common/helpers/engine_descriptor_helper.h"
|
||||
#include "shared/test/common/mocks/mock_builtins.h"
|
||||
#include "shared/test/common/mocks/mock_csr.h"
|
||||
#include "shared/test/common/utilities/base_object_utils.h"
|
||||
|
@ -288,6 +289,191 @@ HWTEST_F(OOQueueHwTest, givenBlockedOutOfOrderCmdQueueAndAsynchronouslyCompleted
|
|||
EXPECT_EQ(virtualEventTaskLevel + 1, mockCSR->lastTaskLevelToFlushTask);
|
||||
}
|
||||
|
||||
// Checks that the SplitBcsCopy debug flag drives isSplitEnqueueBlitSupported():
// 1 reports support, 0 reports no support.
HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenCheckIsSplitEnqueueBlitSupportedThenReturnProperValue) {
    DebugManagerStateRestore restorer;
    auto *queue = static_cast<CommandQueueHw<FamilyType> *>(this->pCmdQ);

    DebugManager.flags.SplitBcsCopy.set(1);
    EXPECT_TRUE(queue->isSplitEnqueueBlitSupported());

    DebugManager.flags.SplitBcsCopy.set(0);
    EXPECT_FALSE(queue->isSplitEnqueueBlitSupported());
}
|
||||
|
||||
// Checks isSplitEnqueueBlitNeeded() for every transfer direction used here:
// only host<->local transfers on the main BCS receiver with SplitBcsCopy=1 need a split.
HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenCheckIsSplitEnqueueBlitNeededThenReturnProperValue) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);
    auto *queue = static_cast<CommandQueueHw<FamilyType> *>(this->pCmdQ);

    // Flag enabled, main copy engine: split only for host<->local directions.
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::HostToHost, *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)));
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::LocalToLocal, *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)));
    EXPECT_TRUE(queue->isSplitEnqueueBlitNeeded(TransferDirection::LocalToHost, *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)));
    EXPECT_TRUE(queue->isSplitEnqueueBlitNeeded(TransferDirection::HostToLocal, *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)));

    // Flag enabled, GPGPU receiver: never split.
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::HostToHost, queue->getGpgpuCommandStreamReceiver()));
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::LocalToLocal, queue->getGpgpuCommandStreamReceiver()));
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::LocalToHost, queue->getGpgpuCommandStreamReceiver()));
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::HostToLocal, queue->getGpgpuCommandStreamReceiver()));

    // Flag disabled: never split, even on the main copy engine.
    DebugManager.flags.SplitBcsCopy.set(0);
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::HostToHost, *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)));
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::LocalToLocal, *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)));
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::LocalToHost, *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)));
    EXPECT_FALSE(queue->isSplitEnqueueBlitNeeded(TransferDirection::HostToLocal, *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)));
}
|
||||
|
||||
// With SplitBcsCopy enabled, a non-blocking read from a local-memory buffer must be
// submitted to the two injected split copy engines (BCS2 and BCS4) and to neither the
// GPGPU receiver nor the main BCS receiver.
HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueReadThenEnqueueBlitSplit) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);
    DebugManager.flags.DoCpuCopyOnReadBuffer.set(0);
    DebugManager.flags.UpdateTaskCountFromWait.set(3);
    auto queue = static_cast<MockCommandQueueHw<FamilyType> *>(this->pCmdQ);

    // First injected split engine: a receiver bound to an ENGINE_BCS2 context.
    auto bcs2Csr = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    std::unique_ptr<OsContext> bcs2Context(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
                                                             EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS2, EngineUsage::Regular},
                                                                                                          PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
    bcs2Csr->setupContext(*bcs2Context);
    bcs2Csr->initializeTagAllocation();
    EngineControl bcs2Control(bcs2Csr.get(), bcs2Context.get());

    // Second injected split engine: a receiver bound to an ENGINE_BCS4 context.
    auto bcs4Csr = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    std::unique_ptr<OsContext> bcs4Context(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
                                                             EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS4, EngineUsage::Regular},
                                                                                                          PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
    bcs4Csr->setupContext(*bcs4Context);
    bcs4Csr->initializeTagAllocation();
    EngineControl bcs4Control(bcs4Csr.get(), bcs4Context.get());

    queue->bcsEngines[2] = &bcs2Control;
    queue->bcsEngines[4] = &bcs4Control;

    BufferDefaults::context = context;
    auto buffer = clUniquePtr(BufferHelper<>::create());
    static_cast<MockGraphicsAllocation *>(buffer->getGraphicsAllocation(0u))->memoryPool = MemoryPool::LocalMemory;
    char ptr[1] = {};

    // Nothing has been submitted anywhere before the read.
    EXPECT_EQ(bcs2Csr->peekTaskCount(), 0u);
    EXPECT_EQ(bcs4Csr->peekTaskCount(), 0u);
    EXPECT_EQ(queue->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
    EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 0u);

    EXPECT_EQ(CL_SUCCESS, queue->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr));

    // Only the two split engines received work.
    EXPECT_EQ(bcs2Csr->peekTaskCount(), 1u);
    EXPECT_EQ(bcs4Csr->peekTaskCount(), 1u);
    EXPECT_EQ(queue->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
    EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 0u);

    // NOTE(review): presumably the queue is released here so it is gone before the
    // stack-allocated engine controls it points at are destroyed - confirm.
    pCmdQ->release();
    pCmdQ = nullptr;
}
|
||||
|
||||
// Blocking variant of the split-read test: with SplitBcsCopy enabled, a CL_TRUE read
// from a local-memory buffer is expected to leave both injected copy engines
// (bcsEngines[2] and bcsEngines[4]) at task count 2, while the GPGPU CSR and the
// main BCS CSR stay at 0.
HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueBlockingReadThenEnqueueBlitSplit) {
    using BlitCsr = CommandStreamReceiverHw<FamilyType>;

    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);
    DebugManager.flags.DoCpuCopyOnReadBuffer.set(0);
    DebugManager.flags.UpdateTaskCountFromWait.set(3);
    auto cmdQHw = static_cast<MockCommandQueueHw<FamilyType> *>(this->pCmdQ);

    // First additional copy engine (BCS2): CSR + OS context + engine control.
    auto bcs2Csr = std::make_unique<BlitCsr>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    auto bcs2Descriptor = EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS2, EngineUsage::Regular},
                                                                       PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield());
    std::unique_ptr<OsContext> bcs2OsContext(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0, bcs2Descriptor));
    bcs2Csr->setupContext(*bcs2OsContext);
    bcs2Csr->initializeTagAllocation();
    EngineControl bcs2Control(bcs2Csr.get(), bcs2OsContext.get());

    // Second additional copy engine (BCS4), built the same way.
    auto bcs4Csr = std::make_unique<BlitCsr>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    auto bcs4Descriptor = EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS4, EngineUsage::Regular},
                                                                       PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield());
    std::unique_ptr<OsContext> bcs4OsContext(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0, bcs4Descriptor));
    bcs4Csr->setupContext(*bcs4OsContext);
    bcs4Csr->initializeTagAllocation();
    EngineControl bcs4Control(bcs4Csr.get(), bcs4OsContext.get());

    // Inject the hand-built engines into the queue's BCS slots.
    cmdQHw->bcsEngines[2] = &bcs2Control;
    cmdQHw->bcsEngines[4] = &bcs4Control;

    BufferDefaults::context = context;
    auto buffer = clUniquePtr(BufferHelper<>::create());
    static_cast<MockGraphicsAllocation *>(buffer->getGraphicsAllocation(0u))->memoryPool = MemoryPool::LocalMemory;
    char hostDst[1] = {};

    // Nothing has been submitted anywhere yet.
    EXPECT_EQ(bcs2Csr->peekTaskCount(), 0u);
    EXPECT_EQ(bcs4Csr->peekTaskCount(), 0u);
    EXPECT_EQ(cmdQHw->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
    EXPECT_EQ(cmdQHw->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 0u);

    const auto readStatus = cmdQHw->enqueueReadBuffer(buffer.get(), CL_TRUE, 0, 1u, hostDst, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, readStatus);

    // After the blocking read both injected copy engines report task count 2.
    EXPECT_EQ(bcs2Csr->peekTaskCount(), 2u);
    EXPECT_EQ(bcs4Csr->peekTaskCount(), 2u);
    EXPECT_EQ(cmdQHw->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
    EXPECT_EQ(cmdQHw->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 0u);

    // Release the queue here, while the stack-allocated engine controls it points at are still alive.
    pCmdQ->release();
    pCmdQ = nullptr;
}
|
||||
|
||||
// With SplitBcsCopy enabled, a non-blocking read that requests an output event is
// expected to be submitted on the two injected copy engines (bcsEngines[2] and
// bcsEngines[4]) and the returned event is expected to carry three timestamp
// packet nodes.
HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueReadWithEventThenEnqueueBlitSplitAndAddBothTimestampsToEvent) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);
    DebugManager.flags.DoCpuCopyOnReadBuffer.set(0);
    DebugManager.flags.UpdateTaskCountFromWait.set(3);
    auto cmdQHw = static_cast<MockCommandQueueHw<FamilyType> *>(this->pCmdQ);

    // First additional copy engine (BCS2): CSR + OS context + engine control.
    auto csr1 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    std::unique_ptr<OsContext> osContext1(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
                                                            EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS2, EngineUsage::Regular},
                                                                                                         PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
    csr1->setupContext(*osContext1);
    csr1->initializeTagAllocation();
    EngineControl control1(csr1.get(), osContext1.get());

    // Second additional copy engine (BCS4), built the same way.
    auto csr2 = std::make_unique<CommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    std::unique_ptr<OsContext> osContext2(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), 0,
                                                            EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS4, EngineUsage::Regular},
                                                                                                         PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield())));
    csr2->setupContext(*osContext2);
    csr2->initializeTagAllocation();
    EngineControl control2(csr2.get(), osContext2.get());

    // Inject the hand-built engines into the queue's BCS slots.
    cmdQHw->bcsEngines[2] = &control1;
    cmdQHw->bcsEngines[4] = &control2;

    BufferDefaults::context = context;
    auto buffer = clUniquePtr(BufferHelper<>::create());
    static_cast<MockGraphicsAllocation *>(buffer->getGraphicsAllocation(0u))->memoryPool = MemoryPool::LocalMemory;
    char ptr[1] = {};

    // Nothing has been submitted anywhere yet.
    EXPECT_EQ(csr1->peekTaskCount(), 0u);
    EXPECT_EQ(csr2->peekTaskCount(), 0u);
    EXPECT_EQ(cmdQHw->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
    EXPECT_EQ(cmdQHw->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 0u);

    // Start from a known value so the null check below is meaningful even if the
    // enqueue fails and never writes the out-parameter.
    cl_event event = nullptr;
    EXPECT_EQ(CL_SUCCESS, cmdQHw->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, &event));

    // Only the two injected copy engines advanced.
    EXPECT_EQ(csr1->peekTaskCount(), 1u);
    EXPECT_EQ(csr2->peekTaskCount(), 1u);
    EXPECT_EQ(cmdQHw->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
    EXPECT_EQ(cmdQHw->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 0u);

    EXPECT_NE(event, nullptr);
    // EXPECT_NE is non-fatal, so guard the dereference and release explicitly. A fatal
    // ASSERT is avoided on purpose: returning early would skip the queue release below,
    // which has to run while control1/control2 are still alive.
    if (event != nullptr) {
        auto pEvent = castToObject<Event>(event);
        EXPECT_NE(pEvent, nullptr);
        if (pEvent != nullptr) {
            EXPECT_EQ(pEvent->getTimestampPacketNodes()->peekNodes().size(), 3u);
        }
        clReleaseEvent(event);
    }

    // Release the queue here, while the stack-allocated engine controls it points at are still alive.
    pCmdQ->release();
    pCmdQ = nullptr;
}
|
||||
|
||||
HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingSubsequentBlitsThenGpgpuCommandStreamIsNotObtained) {
|
||||
auto &gpgpuCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
|
||||
auto srcBuffer = std::unique_ptr<Buffer>{BufferHelper<>::create(pContext)};
|
||||
|
|
|
@ -104,6 +104,109 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenDeferCmdQBcsInitializationEnabledWh
|
|||
EXPECT_EQ(0u, queue->countBcsEngines());
|
||||
}
|
||||
|
||||
// With deferred BCS initialization a fresh queue owns no copy engines;
// constructBcsEnginesForSplit() is expected to bring the count to four, and calling
// it a second time must not add any more.
HWTEST2_F(CommandQueuePvcAndLaterTests, whenConstructBcsEnginesForSplitThenContainsMultipleBcsEngines, IsAtLeastXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.DeferCmdQBcsInitialization.set(1u);

    // Hardware description advertising nine BCS engines with blitter support.
    HardwareInfo localHwInfo = *defaultHwInfo;
    localHwInfo.featureTable.ftrBcsInfo = maxNBitValue(9);
    localHwInfo.capabilityTable.blitterOperationsSupported = true;

    MockDevice *neoDevice = MockDevice::createWithNewExecutionEnvironment<MockDevice>(&localHwInfo, 0);
    MockClDevice clDevice{neoDevice};
    cl_device_id deviceHandle = static_cast<cl_device_id>(&clDevice);
    ClDeviceVector deviceVector{&deviceHandle, 1u};

    cl_int retVal{};
    std::unique_ptr<Context> context{Context::create<Context>(nullptr, deviceVector, nullptr, nullptr, retVal)};
    EXPECT_EQ(CL_SUCCESS, retVal);

    auto queue = std::make_unique<MockCommandQueue>(*context);
    EXPECT_EQ(0u, queue->countBcsEngines());

    // First call creates the split engines.
    queue->constructBcsEnginesForSplit();
    EXPECT_EQ(4u, queue->countBcsEngines());

    // Second call leaves the engine count unchanged.
    queue->constructBcsEnginesForSplit();
    EXPECT_EQ(4u, queue->countBcsEngines());
}
|
||||
|
||||
// selectCsrForHostPtrAllocation(split, csr) is expected to return the GPGPU CSR when
// split is true and to hand back the CSR it was given when split is false.
HWTEST2_F(CommandQueuePvcAndLaterTests, whenSelectCsrForHostPtrAllocationThenReturnProperEngine, IsAtLeastXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.DeferCmdQBcsInitialization.set(1u);

    // Hardware description advertising nine BCS engines with blitter support.
    HardwareInfo localHwInfo = *defaultHwInfo;
    localHwInfo.featureTable.ftrBcsInfo = maxNBitValue(9);
    localHwInfo.capabilityTable.blitterOperationsSupported = true;

    MockDevice *neoDevice = MockDevice::createWithNewExecutionEnvironment<MockDevice>(&localHwInfo, 0);
    MockClDevice clDevice{neoDevice};
    cl_device_id deviceHandle = static_cast<cl_device_id>(&clDevice);
    ClDeviceVector deviceVector{&deviceHandle, 1u};

    cl_int retVal{};
    std::unique_ptr<Context> context{Context::create<Context>(nullptr, deviceVector, nullptr, nullptr, retVal)};
    EXPECT_EQ(CL_SUCCESS, retVal);

    auto queue = std::make_unique<MockCommandQueue>(*context);
    EXPECT_EQ(0u, queue->countBcsEngines());
    queue->constructBcsEnginesForSplit();
    EXPECT_EQ(4u, queue->countBcsEngines());

    // split == true: the GPGPU CSR is selected.
    auto &bcsForSplitCase = *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS2);
    auto &selectedWhenSplit = queue->selectCsrForHostPtrAllocation(true, bcsForSplitCase);
    EXPECT_EQ(&selectedWhenSplit, &queue->getGpgpuCommandStreamReceiver());

    // split == false: the passed-in BCS2 CSR is selected.
    auto &bcsForNoSplitCase = *queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS2);
    auto &selectedWhenNotSplit = queue->selectCsrForHostPtrAllocation(false, bcsForNoSplitCase);
    EXPECT_EQ(&selectedWhenNotSplit, queue->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS2));
}
|
||||
|
||||
// prepareHostPtrSurfaceForSplit(split, allocation):
//  - with split == false the allocation's task count stays objectNotUsed for every
//    available BCS1..BCS8 context,
//  - with split == true it becomes 0 for each of them,
//  - repeating the split == true call keeps it at 0.
HWTEST2_F(CommandQueuePvcAndLaterTests, whenPrepareHostPtrSurfaceForSplitThenSetTaskCountsToZero, IsAtLeastXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.DeferCmdQBcsInitialization.set(1u);

    // Hardware description advertising nine BCS engines with blitter support.
    HardwareInfo localHwInfo = *defaultHwInfo;
    localHwInfo.featureTable.ftrBcsInfo = maxNBitValue(9);
    localHwInfo.capabilityTable.blitterOperationsSupported = true;

    MockDevice *neoDevice = MockDevice::createWithNewExecutionEnvironment<MockDevice>(&localHwInfo, 0);
    MockClDevice clDevice{neoDevice};
    cl_device_id deviceHandle = static_cast<cl_device_id>(&clDevice);
    ClDeviceVector deviceVector{&deviceHandle, 1u};

    cl_int retVal{};
    std::unique_ptr<Context> context{Context::create<Context>(nullptr, deviceVector, nullptr, nullptr, retVal)};
    EXPECT_EQ(CL_SUCCESS, retVal);

    auto queue = std::make_unique<MockCommandQueue>(*context);
    EXPECT_EQ(0u, queue->countBcsEngines());
    queue->constructBcsEnginesForSplit();
    EXPECT_EQ(4u, queue->countBcsEngines());

    // Host pointer surface whose allocation is created through the GPGPU CSR.
    auto ptr = reinterpret_cast<void *>(0x1234);
    auto ptrSize = MemoryConstants::pageSize;
    HostPtrSurface hostPtrSurf(ptr, ptrSize);
    queue->getGpgpuCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, false);

    // Checks the allocation's task count against `expected` for every BCS1..BCS8
    // engine the queue actually exposes; missing engines are skipped.
    auto expectTaskCountOnLinkedBcs = [&](auto expected) {
        const auto firstEngine = static_cast<uint32_t>(aub_stream::EngineType::ENGINE_BCS1);
        const auto lastEngine = static_cast<uint32_t>(aub_stream::EngineType::ENGINE_BCS8);
        for (auto engine = firstEngine; engine <= lastEngine; engine++) {
            auto bcs = queue->getBcsCommandStreamReceiver(static_cast<aub_stream::EngineType>(engine));
            if (!bcs) {
                continue;
            }
            auto contextId = bcs->getOsContext().getContextId();
            EXPECT_EQ(hostPtrSurf.getAllocation()->getTaskCount(contextId), expected);
        }
    };

    queue->prepareHostPtrSurfaceForSplit(false, *hostPtrSurf.getAllocation());
    expectTaskCountOnLinkedBcs(GraphicsAllocation::objectNotUsed);

    queue->prepareHostPtrSurfaceForSplit(true, *hostPtrSurf.getAllocation());
    expectTaskCountOnLinkedBcs(0u);

    queue->prepareHostPtrSurfaceForSplit(true, *hostPtrSurf.getAllocation());
    expectTaskCountOnLinkedBcs(0u);
}
|
||||
|
||||
HWTEST2_F(CommandQueuePvcAndLaterTests, givenDeferCmdQBcsInitializationDisabledWhenCreateCommandQueueThenBcsIsInitialized, IsAtLeastXeHpcCore) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.DeferCmdQBcsInitialization.set(0u);
|
||||
|
|
|
@ -287,6 +287,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, AssignBCSAtEnqueue, -1, "-1: default, 0:disabled
|
|||
DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQGpgpuInitialization, -1, "-1: default, 0:disabled, 1: enabled.")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQBcsInitialization, -1, "-1: default, 0:disabled, 1: enabled.")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, PreferInternalBcsEngine, -1, "-1: default, 0:disabled, 1: enabled. When enabled use internal BCS engine for internal transfers, when disabled use regular engine")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsCopy, -1, "-1: default, 0:disabled, 1: enabled. When enqueues copy to main copy engine then split between even linked copy engines")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.")
|
||||
|
||||
/*DIRECT SUBMISSION FLAGS*/
|
||||
|
|
|
@ -125,6 +125,7 @@ class HwInfoConfig {
|
|||
virtual bool isTilePlacementResourceWaRequired(const HardwareInfo &hwInfo) const = 0;
|
||||
virtual bool allowMemoryPrefetch(const HardwareInfo &hwInfo) const = 0;
|
||||
virtual bool isBcsReportWaRequired(const HardwareInfo &hwInfo) const = 0;
|
||||
virtual bool isBlitSplitEnqueueWARequired(const HardwareInfo &hwInfo) const = 0;
|
||||
virtual bool isBlitCopyRequiredForLocalMemory(const HardwareInfo &hwInfo, const GraphicsAllocation &allocation) const = 0;
|
||||
virtual bool isImplicitScalingSupported(const HardwareInfo &hwInfo) const = 0;
|
||||
virtual bool isCpuCopyNecessary(const void *ptr, MemoryManager *memoryManager) const = 0;
|
||||
|
@ -227,6 +228,7 @@ class HwInfoConfigHw : public HwInfoConfig {
|
|||
bool isCooperativeEngineSupported(const HardwareInfo &hwInfo) const override;
|
||||
bool isTimestampWaitSupportedForEvents() const override;
|
||||
bool isTilePlacementResourceWaRequired(const HardwareInfo &hwInfo) const override;
|
||||
bool isBlitSplitEnqueueWARequired(const HardwareInfo &hwInfo) const override;
|
||||
bool allowMemoryPrefetch(const HardwareInfo &hwInfo) const override;
|
||||
bool isBcsReportWaRequired(const HardwareInfo &hwInfo) const override;
|
||||
bool isBlitCopyRequiredForLocalMemory(const HardwareInfo &hwInfo, const GraphicsAllocation &allocation) const override;
|
||||
|
|
|
@ -455,6 +455,11 @@ bool HwInfoConfigHw<gfxProduct>::isBcsReportWaRequired(const HardwareInfo &hwInf
|
|||
return false;
|
||||
}
|
||||
|
||||
template <PRODUCT_FAMILY gfxProduct>
|
||||
bool HwInfoConfigHw<gfxProduct>::isBlitSplitEnqueueWARequired(const HardwareInfo &hwInfo) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <PRODUCT_FAMILY gfxProduct>
|
||||
bool HwInfoConfigHw<gfxProduct>::isBlitCopyRequiredForLocalMemory(const HardwareInfo &hwInfo, const GraphicsAllocation &allocation) const {
|
||||
return allocation.isAllocatedInLocalMemoryPool() &&
|
||||
|
|
|
@ -227,9 +227,7 @@ class StackVec { // NOLINT(clang-analyzer-optin.performance.Padding)
|
|||
}
|
||||
|
||||
void push_back(const DataType &v) { // NOLINT(readability-identifier-naming)
|
||||
if (onStackSize == onStackCaps) {
|
||||
ensureDynamicMem();
|
||||
}
|
||||
isDynamicMemNeeded();
|
||||
|
||||
if (usesDynamicMem()) {
|
||||
dynamicMem->push_back(v);
|
||||
|
@ -240,6 +238,18 @@ class StackVec { // NOLINT(clang-analyzer-optin.performance.Padding)
|
|||
++onStackSize;
|
||||
}
|
||||
|
||||
// Appends `v` by moving it into the container. The element goes to the dynamic
// storage when that is in use; otherwise it is move-constructed in place in the
// next free on-stack slot.
void push_back(DataType &&v) { // NOLINT(readability-identifier-naming)
    // Switch to dynamic storage first if the on-stack buffer is already full.
    isDynamicMemNeeded();

    if (!usesDynamicMem()) {
        auto *freeSlot = reinterpret_cast<DataType *>(onStackMemRawBytes) + onStackSize;
        new (freeSlot) DataType(std::move(v));
        ++onStackSize;
        return;
    }

    dynamicMem->push_back(std::move(v));
}
|
||||
|
||||
// Sorts the stored elements in place over [begin(), end()) with std::sort's
// default comparison.
void sort() {
    auto first = this->begin();
    auto last = this->end();
    std::sort(first, last);
}
|
||||
|
@ -399,6 +409,12 @@ class StackVec { // NOLINT(clang-analyzer-optin.performance.Padding)
|
|||
}
|
||||
}
|
||||
|
||||
void isDynamicMemNeeded() {
|
||||
if (onStackSize == onStackCaps) {
|
||||
ensureDynamicMem();
|
||||
}
|
||||
}
|
||||
|
||||
void ensureDynamicMem() {
|
||||
if (usesDynamicMem()) {
|
||||
return;
|
||||
|
|
|
@ -393,6 +393,7 @@ MakeEachAllocationResident = -1
|
|||
AssignBCSAtEnqueue = -1
|
||||
DeferCmdQGpgpuInitialization = -1
|
||||
DeferCmdQBcsInitialization = -1
|
||||
SplitBcsCopy = -1
|
||||
PreferInternalBcsEngine = -1
|
||||
ReuseKernelBinaries = -1
|
||||
EnableChipsetUniqueUUID = -1
|
||||
|
|
|
@ -52,6 +52,11 @@ HWTEST_F(HwInfoConfigTest, givenHwInfoConfigWhenGettingSharedSystemMemCapabiliti
|
|||
}
|
||||
}
|
||||
|
||||
// The HwInfoConfig obtained for the tested product family is expected to report
// that the blit split enqueue workaround is not required.
HWTEST_F(HwInfoConfigTest, givenHwInfoConfigWhenAskedIfIsBlitSplitEnqueueWARequiredThenReturnFalse) {
    const bool workaroundRequired = HwInfoConfig::get(productFamily)->isBlitSplitEnqueueWARequired(pInHwInfo);
    EXPECT_FALSE(workaroundRequired);
}
|
||||
|
||||
HWTEST_F(HwInfoConfigTest, givenHwInfoConfigWhenGettingMemoryCapabilitiesThenCorrectValueIsReturned) {
|
||||
DebugManagerStateRestore restore;
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#include "shared/source/helpers/compiler_hw_info_config.h"
|
||||
#include "shared/source/os_interface/os_interface.h"
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
#include "shared/test/common/helpers/default_hw_info.h"
|
||||
#include "shared/test/common/helpers/gtest_helpers.h"
|
||||
#include "shared/test/common/helpers/variable_backup.h"
|
||||
|
|
Loading…
Reference in New Issue