mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-25 21:42:53 +08:00
In constructor of CommandComputeKernel we had been doing multiple allocations of memory on heap due to lack of call to std::vector copy-constructor or reserve member function. Furthermore, in production code there is only one place, where we create objects of this type and we redundantly copy the local variable, which could be moved. This change: - ensures that constructor of CommandComputeKernel performs single allocation in the worst case; in the best case, it does not allocate memory due to usage of std::move on input parameter - steals the memory of the local variable in place of usage of the constructor to remove redundant copying and memory allocations - uses reserve() method to reduce the number of allocations during creation of this local variable Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
1202 lines
59 KiB
C++
1202 lines
59 KiB
C++
/*
|
|
* Copyright (C) 2018-2022 Intel Corporation
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*
|
|
*/
|
|
|
|
#pragma once
|
|
#include "shared/source/built_ins/built_ins.h"
|
|
#include "shared/source/command_stream/command_stream_receiver.h"
|
|
#include "shared/source/helpers/array_count.h"
|
|
#include "shared/source/helpers/engine_node_helper.h"
|
|
#include "shared/source/helpers/local_work_size.h"
|
|
#include "shared/source/helpers/pipe_control_args.h"
|
|
#include "shared/source/memory_manager/internal_allocation_storage.h"
|
|
#include "shared/source/memory_manager/memory_manager.h"
|
|
#include "shared/source/memory_manager/surface.h"
|
|
#include "shared/source/os_interface/os_context.h"
|
|
#include "shared/source/program/sync_buffer_handler.h"
|
|
#include "shared/source/program/sync_buffer_handler.inl"
|
|
#include "shared/source/utilities/range.h"
|
|
#include "shared/source/utilities/tag_allocator.h"
|
|
|
|
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
|
|
#include "opencl/source/command_queue/command_queue_hw.h"
|
|
#include "opencl/source/command_queue/gpgpu_walker.h"
|
|
#include "opencl/source/command_queue/hardware_interface.h"
|
|
#include "opencl/source/event/event_builder.h"
|
|
#include "opencl/source/event/user_event.h"
|
|
#include "opencl/source/gtpin/gtpin_notify.h"
|
|
#include "opencl/source/helpers/cl_blit_properties.h"
|
|
#include "opencl/source/helpers/cl_hw_helper.h"
|
|
#include "opencl/source/helpers/cl_preemption_helper.h"
|
|
#include "opencl/source/helpers/dispatch_info_builder.h"
|
|
#include "opencl/source/helpers/enqueue_properties.h"
|
|
#include "opencl/source/helpers/hardware_commands_helper.h"
|
|
#include "opencl/source/helpers/task_information.h"
|
|
#include "opencl/source/mem_obj/buffer.h"
|
|
#include "opencl/source/mem_obj/image.h"
|
|
#include "opencl/source/memory_manager/migration_controller.h"
|
|
#include "opencl/source/program/printf_handler.h"
|
|
#include "opencl/source/utilities/cl_logger.h"
|
|
|
|
#include <algorithm>
|
|
#include <new>
|
|
|
|
namespace NEO {
|
|
|
|
template <typename GfxFamily>
|
|
template <uint32_t commandType, size_t surfaceCount>
|
|
void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount],
|
|
bool blocking,
|
|
Kernel *kernel,
|
|
cl_uint workDim,
|
|
const size_t globalOffsets[3],
|
|
const size_t workItems[3],
|
|
const size_t *localWorkSizesIn,
|
|
const size_t *enqueuedWorkSizes,
|
|
cl_uint numEventsInWaitList,
|
|
const cl_event *eventWaitList,
|
|
cl_event *event) {
|
|
BuiltInOwnershipWrapper builtInLock;
|
|
KernelObjsForAuxTranslation kernelObjsForAuxTranslation;
|
|
MultiDispatchInfo multiDispatchInfo(kernel);
|
|
|
|
auto auxTranslationMode = AuxTranslationMode::None;
|
|
|
|
kernel->updateAuxTranslationRequired();
|
|
if (kernel->isAuxTranslationRequired()) {
|
|
kernel->fillWithKernelObjsForAuxTranslation(kernelObjsForAuxTranslation);
|
|
multiDispatchInfo.setKernelObjsForAuxTranslation(kernelObjsForAuxTranslation);
|
|
|
|
if (!kernelObjsForAuxTranslation.empty()) {
|
|
auxTranslationMode = HwHelperHw<GfxFamily>::get().getAuxTranslationMode(device->getHardwareInfo());
|
|
}
|
|
}
|
|
|
|
if (AuxTranslationMode::Builtin == auxTranslationMode) {
|
|
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getClDevice());
|
|
builtInLock.takeOwnership(builder, this->context);
|
|
|
|
dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::AuxToNonAux);
|
|
}
|
|
|
|
if (kernel->getKernelInfo().builtinDispatchBuilder == nullptr) {
|
|
DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::WalkerSplit> builder(getClDevice());
|
|
builder.setDispatchGeometry(workDim, workItems, enqueuedWorkSizes, globalOffsets, Vec3<size_t>{0, 0, 0}, localWorkSizesIn);
|
|
builder.setKernel(kernel);
|
|
builder.bake(multiDispatchInfo);
|
|
} else {
|
|
auto builder = kernel->getKernelInfo().builtinDispatchBuilder;
|
|
builder->buildDispatchInfos(multiDispatchInfo, kernel, workDim, workItems, enqueuedWorkSizes, globalOffsets);
|
|
|
|
if (multiDispatchInfo.size() == 0) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (AuxTranslationMode::Builtin == auxTranslationMode) {
|
|
dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::NonAuxToAux);
|
|
}
|
|
|
|
if (AuxTranslationMode::Blit == auxTranslationMode) {
|
|
setupBlitAuxTranslation(multiDispatchInfo);
|
|
}
|
|
|
|
enqueueHandler<commandType>(surfaces, blocking, multiDispatchInfo, numEventsInWaitList, eventWaitList, event);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
template <uint32_t commandType>
|
|
void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
|
size_t numSurfaceForResidency,
|
|
bool blocking,
|
|
const MultiDispatchInfo &multiDispatchInfo,
|
|
cl_uint numEventsInWaitList,
|
|
const cl_event *eventWaitList,
|
|
cl_event *event) {
|
|
if (multiDispatchInfo.empty() && !isCommandWithoutKernel(commandType)) {
|
|
enqueueHandler<CL_COMMAND_MARKER>(nullptr, 0, blocking, multiDispatchInfo,
|
|
numEventsInWaitList, eventWaitList, event);
|
|
if (event) {
|
|
castToObjectOrAbort<Event>(*event)->setCmdType(commandType);
|
|
}
|
|
return;
|
|
}
|
|
|
|
TagNodeBase *hwTimeStamps = nullptr;
|
|
CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver();
|
|
|
|
EventBuilder eventBuilder;
|
|
setupEvent(eventBuilder, event, commandType);
|
|
|
|
bool isMarkerWithProfiling = (CL_COMMAND_MARKER == commandType) && (eventBuilder.getEvent() && eventBuilder.getEvent()->isProfilingEnabled());
|
|
|
|
std::unique_ptr<KernelOperation> blockedCommandsData;
|
|
std::unique_ptr<PrintfHandler> printfHandler;
|
|
TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
|
|
auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership();
|
|
|
|
auto blockQueue = false;
|
|
auto taskLevel = 0u;
|
|
obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitList, eventWaitList, blockQueue, commandType);
|
|
|
|
enqueueHandlerHook(commandType, multiDispatchInfo);
|
|
|
|
bool clearDependenciesForSubCapture = false;
|
|
aubCaptureHook(blocking, clearDependenciesForSubCapture, multiDispatchInfo);
|
|
|
|
bool clearAllDependencies = (queueDependenciesClearRequired() || clearDependenciesForSubCapture);
|
|
|
|
if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
|
|
blocking = true;
|
|
}
|
|
|
|
TimestampPacketDependencies timestampPacketDependencies;
|
|
EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
|
|
CsrDependencies csrDeps;
|
|
BlitPropertiesContainer blitPropertiesContainer;
|
|
|
|
if (this->context->getRootDeviceIndices().size() > 1) {
|
|
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver);
|
|
}
|
|
|
|
bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo);
|
|
|
|
if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
|
|
if (!clearDependenciesForSubCapture) {
|
|
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, computeCommandStreamReceiver, CsrDependencies::DependenciesType::OnCsr);
|
|
}
|
|
|
|
auto allocator = computeCommandStreamReceiver.getTimestampPacketAllocator();
|
|
|
|
size_t nodesCount = 0u;
|
|
if (isCacheFlushCommand(commandType) || isMarkerWithProfiling) {
|
|
nodesCount = 1;
|
|
} else if (!multiDispatchInfo.empty()) {
|
|
nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
|
|
}
|
|
|
|
if (isCacheFlushForBcsRequired() && enqueueWithBlitAuxTranslation) {
|
|
// Cache flush for aux translation is always required (if supported)
|
|
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
|
|
}
|
|
|
|
if (nodesCount > 0) {
|
|
obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, computeCommandStreamReceiver);
|
|
csrDeps.timestampPacketContainer.push_back(×tampPacketDependencies.previousEnqueueNodes);
|
|
}
|
|
}
|
|
|
|
auto &commandStream = *obtainCommandStream<commandType>(csrDeps, false, blockQueue, multiDispatchInfo, eventsRequest,
|
|
blockedCommandsData, surfacesForResidency, numSurfaceForResidency, isMarkerWithProfiling);
|
|
auto commandStreamStart = commandStream.getUsed();
|
|
|
|
if (this->context->getRootDeviceIndices().size() > 1) {
|
|
TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, csrDeps);
|
|
}
|
|
|
|
if (enqueueWithBlitAuxTranslation) {
|
|
processDispatchForBlitAuxTranslation(*getBcsForAuxTranslation(), multiDispatchInfo, blitPropertiesContainer,
|
|
timestampPacketDependencies, eventsRequest, blockQueue);
|
|
}
|
|
|
|
if (eventBuilder.getEvent() && computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
|
|
eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
|
|
eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
|
|
eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.auxToNonAuxNodes);
|
|
}
|
|
|
|
bool flushDependenciesForNonKernelCommand = false;
|
|
|
|
if (multiDispatchInfo.empty() == false) {
|
|
processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
|
|
hwTimeStamps, blockQueue, csrDeps, blockedCommandsData.get(),
|
|
timestampPacketDependencies);
|
|
} else if (isCacheFlushCommand(commandType)) {
|
|
processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
|
|
} else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
|
|
if (CL_COMMAND_BARRIER == commandType) {
|
|
computeCommandStreamReceiver.requestStallingCommandsOnNextFlush();
|
|
}
|
|
|
|
for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
|
|
auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
|
|
if (waitlistEvent->getTimestampPacketNodes()) {
|
|
flushDependenciesForNonKernelCommand = true;
|
|
if (eventBuilder.getEvent()) {
|
|
eventBuilder.getEvent()->addTimestampPacketNodes(*waitlistEvent->getTimestampPacketNodes());
|
|
}
|
|
}
|
|
}
|
|
|
|
if (isMarkerWithProfiling) {
|
|
flushDependenciesForNonKernelCommand = true;
|
|
}
|
|
|
|
if (flushDependenciesForNonKernelCommand) {
|
|
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, csrDeps);
|
|
}
|
|
|
|
if (isMarkerWithProfiling) {
|
|
if (numEventsInWaitList == 0) {
|
|
computeCommandStreamReceiver.programComputeBarrierCommand(commandStream);
|
|
}
|
|
processDispatchForMarkerWithTimestampPacket(*this, &commandStream, eventsRequest, csrDeps);
|
|
}
|
|
} else if (isMarkerWithProfiling) {
|
|
processDispatchForMarker(*this, &commandStream, eventsRequest, csrDeps);
|
|
}
|
|
|
|
CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
|
|
const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType),
|
|
flushDependenciesForNonKernelCommand, isMarkerWithProfiling, &blitPropertiesContainer);
|
|
|
|
if (!blockQueue && isOOQEnabled()) {
|
|
setupBarrierTimestampForBcsEngines(computeCommandStreamReceiver.getOsContext().getEngineType(), timestampPacketDependencies);
|
|
}
|
|
|
|
bool migratedMemory = false;
|
|
|
|
if (!blockQueue && multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->requiresMemoryMigration()) {
|
|
for (auto &arg : multiDispatchInfo.peekMainKernel()->getMemObjectsToMigrate()) {
|
|
MigrationController::handleMigration(*this->context, computeCommandStreamReceiver, arg.second);
|
|
migratedMemory = true;
|
|
}
|
|
}
|
|
if (!blockQueue) {
|
|
|
|
if (enqueueProperties.operation == EnqueueProperties::Operation::GpuKernel) {
|
|
csrDeps.makeResident(computeCommandStreamReceiver);
|
|
|
|
completionStamp = enqueueNonBlocked<commandType>(
|
|
surfacesForResidency,
|
|
numSurfaceForResidency,
|
|
commandStream,
|
|
commandStreamStart,
|
|
blocking,
|
|
clearDependenciesForSubCapture,
|
|
multiDispatchInfo,
|
|
enqueueProperties,
|
|
timestampPacketDependencies,
|
|
eventsRequest,
|
|
eventBuilder,
|
|
taskLevel,
|
|
printfHandler.get(),
|
|
getBcsForAuxTranslation());
|
|
} else if (enqueueProperties.isFlushWithoutKernelRequired()) {
|
|
completionStamp = enqueueCommandWithoutKernel(
|
|
surfacesForResidency,
|
|
numSurfaceForResidency,
|
|
&commandStream,
|
|
commandStreamStart,
|
|
blocking,
|
|
enqueueProperties,
|
|
timestampPacketDependencies,
|
|
eventsRequest,
|
|
eventBuilder,
|
|
taskLevel,
|
|
csrDeps,
|
|
nullptr);
|
|
} else {
|
|
UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::EnqueueWithoutSubmission);
|
|
|
|
auto maxTaskCountCurrentRootDevice = this->taskCount;
|
|
|
|
for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
|
|
auto event = castToObject<Event>(eventWaitList[eventId]);
|
|
|
|
if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() == this->getDevice().getRootDeviceIndex()) {
|
|
maxTaskCountCurrentRootDevice = std::max(maxTaskCountCurrentRootDevice, event->peekTaskCount());
|
|
}
|
|
}
|
|
|
|
//inherit data from event_wait_list and previous packets
|
|
completionStamp.flushStamp = this->flushStamp->peekStamp();
|
|
completionStamp.taskCount = maxTaskCountCurrentRootDevice;
|
|
completionStamp.taskLevel = taskLevel;
|
|
|
|
if (eventBuilder.getEvent() && isProfilingEnabled()) {
|
|
eventBuilder.getEvent()->setSubmitTimeStamp();
|
|
eventBuilder.getEvent()->setStartTimeStamp();
|
|
}
|
|
|
|
//check if we have BCS associated, if so we need to make sure it is completed as well
|
|
if (eventBuilder.getEvent() && this->bcsEngineTypes.size() > 0u) {
|
|
eventBuilder.getEvent()->setupBcs(this->getBcsCommandStreamReceiver(this->bcsEngineTypes[0u])->getOsContext().getEngineType());
|
|
}
|
|
}
|
|
if (eventBuilder.getEvent()) {
|
|
eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
|
|
}
|
|
|
|
this->latestSentEnqueueType = enqueueProperties.operation;
|
|
}
|
|
updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
|
|
|
|
if (blockQueue) {
|
|
enqueueBlocked(commandType,
|
|
surfacesForResidency,
|
|
numSurfaceForResidency,
|
|
multiDispatchInfo,
|
|
timestampPacketDependencies,
|
|
blockedCommandsData,
|
|
enqueueProperties,
|
|
eventsRequest,
|
|
eventBuilder,
|
|
std::move(printfHandler),
|
|
nullptr);
|
|
}
|
|
|
|
if (deferredTimestampPackets.get()) {
|
|
timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
|
|
}
|
|
|
|
queueOwnership.unlock();
|
|
commandStreamReceiverOwnership.unlock();
|
|
|
|
if (blocking) {
|
|
auto &builtinOpParams = multiDispatchInfo.peekBuiltinOpParams();
|
|
if (builtinOpParams.userPtrForPostOperationCpuCopy) {
|
|
waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), false);
|
|
auto hostPtrAlloc = builtinOpParams.transferAllocation;
|
|
UNRECOVERABLE_IF(nullptr == hostPtrAlloc);
|
|
auto size = hostPtrAlloc->getUnderlyingBufferSize();
|
|
[[maybe_unused]] int cpuCopyStatus = memcpy_s(builtinOpParams.userPtrForPostOperationCpuCopy, size, hostPtrAlloc->getUnderlyingBuffer(), size);
|
|
DEBUG_BREAK_IF(cpuCopyStatus != 0);
|
|
waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), true);
|
|
} else {
|
|
waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), true);
|
|
}
|
|
}
|
|
if (migratedMemory) {
|
|
computeCommandStreamReceiver.flushBatchedSubmissions();
|
|
}
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
template <uint32_t commandType>
|
|
void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo,
|
|
std::unique_ptr<PrintfHandler> &printfHandler,
|
|
Event *event,
|
|
TagNodeBase *&hwTimeStamps,
|
|
bool blockQueue,
|
|
CsrDependencies &csrDeps,
|
|
KernelOperation *blockedCommandsData,
|
|
TimestampPacketDependencies ×tampPacketDependencies) {
|
|
TagNodeBase *hwPerfCounter = nullptr;
|
|
getClFileLogger().dumpKernelArgs(&multiDispatchInfo);
|
|
|
|
printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
|
|
if (printfHandler) {
|
|
printfHandler->prepareDispatch(multiDispatchInfo);
|
|
}
|
|
|
|
if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
|
|
auto &gws = multiDispatchInfo.begin()->getGWS();
|
|
auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
|
|
size_t workGroupsCount = (gws.x * gws.y * gws.z) /
|
|
(lws.x * lws.y * lws.z);
|
|
device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
|
|
}
|
|
|
|
if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
|
|
if (multiDispatchInfo.peekMainKernel()->isKernelDebugEnabled()) {
|
|
setupDebugSurface(multiDispatchInfo.peekMainKernel());
|
|
}
|
|
}
|
|
|
|
if (event && this->isProfilingEnabled()) {
|
|
// Get allocation for timestamps
|
|
hwTimeStamps = event->getHwTimeStampNode();
|
|
}
|
|
|
|
if (event && this->isPerfCountersEnabled()) {
|
|
hwPerfCounter = event->getHwPerfCounterNode();
|
|
}
|
|
|
|
HardwareInterface<GfxFamily>::dispatchWalker(
|
|
*this,
|
|
multiDispatchInfo,
|
|
csrDeps,
|
|
blockedCommandsData,
|
|
hwTimeStamps,
|
|
hwPerfCounter,
|
|
×tampPacketDependencies,
|
|
timestampPacketContainer.get(),
|
|
commandType);
|
|
|
|
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
|
|
for (auto &dispatchInfo : multiDispatchInfo) {
|
|
for (auto &patchInfoData : dispatchInfo.getKernel()->getPatchInfoDataList()) {
|
|
getGpgpuCommandStreamReceiver().getFlatBatchBufferHelper().setPatchInfoData(patchInfoData);
|
|
}
|
|
}
|
|
}
|
|
|
|
getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(), multiDispatchInfo.getRequiredPrivateScratchSize());
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandStreamReceiver &blitCommandStreamReceiver,
|
|
const MultiDispatchInfo &multiDispatchInfo,
|
|
TimestampPacketDependencies ×tampPacketDependencies,
|
|
const EventsRequest &eventsRequest, LinearStream *commandStream,
|
|
uint32_t commandType, bool queueBlocked) {
|
|
auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType);
|
|
|
|
auto blitProperties = ClBlitProperties::constructProperties(blitDirection, blitCommandStreamReceiver,
|
|
multiDispatchInfo.peekBuiltinOpParams());
|
|
if (!queueBlocked) {
|
|
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, blitCommandStreamReceiver,
|
|
CsrDependencies::DependenciesType::All);
|
|
|
|
blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.cacheFlushNodes);
|
|
blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.previousEnqueueNodes);
|
|
blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.barrierNodes);
|
|
}
|
|
|
|
auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
|
|
blitProperties.outputTimestampPacket = currentTimestampPacketNode;
|
|
|
|
if (commandStream) {
|
|
if (timestampPacketDependencies.cacheFlushNodes.peekNodes().size() > 0) {
|
|
auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
|
|
const auto &hwInfo = device->getHardwareInfo();
|
|
PipeControlArgs args;
|
|
args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, hwInfo);
|
|
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
|
|
*commandStream,
|
|
GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
|
|
cacheFlushTimestampPacketGpuAddress,
|
|
0,
|
|
hwInfo,
|
|
args);
|
|
}
|
|
}
|
|
return blitProperties;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void CommandQueueHw<GfxFamily>::processDispatchForBlitAuxTranslation(CommandStreamReceiver &bcsCsr,
|
|
const MultiDispatchInfo &multiDispatchInfo,
|
|
BlitPropertiesContainer &blitPropertiesContainer,
|
|
TimestampPacketDependencies ×tampPacketDependencies,
|
|
const EventsRequest &eventsRequest, bool queueBlocked) {
|
|
auto rootDeviceIndex = getDevice().getRootDeviceIndex();
|
|
auto nodesAllocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
|
|
auto numKernelObjs = multiDispatchInfo.getKernelObjsForAuxTranslation()->size();
|
|
blitPropertiesContainer.resize(numKernelObjs * 2);
|
|
|
|
auto bufferIndex = 0;
|
|
for (auto &kernelObj : *multiDispatchInfo.getKernelObjsForAuxTranslation()) {
|
|
GraphicsAllocation *allocation = nullptr;
|
|
if (kernelObj.type == KernelObjForAuxTranslation::Type::MEM_OBJ) {
|
|
auto buffer = static_cast<Buffer *>(kernelObj.object);
|
|
allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
|
|
} else {
|
|
DEBUG_BREAK_IF(kernelObj.type != KernelObjForAuxTranslation::Type::GFX_ALLOC);
|
|
allocation = static_cast<GraphicsAllocation *>(kernelObj.object);
|
|
}
|
|
{
|
|
// Aux to NonAux
|
|
blitPropertiesContainer[bufferIndex] = BlitProperties::constructPropertiesForAuxTranslation(
|
|
AuxTranslationDirection::AuxToNonAux, allocation, getGpgpuCommandStreamReceiver().getClearColorAllocation());
|
|
auto auxToNonAuxNode = nodesAllocator->getTag();
|
|
timestampPacketDependencies.auxToNonAuxNodes.add(auxToNonAuxNode);
|
|
}
|
|
|
|
{
|
|
// NonAux to Aux
|
|
blitPropertiesContainer[bufferIndex + numKernelObjs] = BlitProperties::constructPropertiesForAuxTranslation(
|
|
AuxTranslationDirection::NonAuxToAux, allocation, getGpgpuCommandStreamReceiver().getClearColorAllocation());
|
|
auto nonAuxToAuxNode = nodesAllocator->getTag();
|
|
timestampPacketDependencies.nonAuxToAuxNodes.add(nonAuxToAuxNode);
|
|
}
|
|
bufferIndex++;
|
|
}
|
|
|
|
if (!queueBlocked) {
|
|
CsrDependencies csrDeps;
|
|
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
|
|
BlitProperties::setupDependenciesForAuxTranslation(blitPropertiesContainer, timestampPacketDependencies,
|
|
*this->timestampPacketContainer, csrDeps,
|
|
getGpgpuCommandStreamReceiver(), bcsCsr);
|
|
}
|
|
|
|
eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void CommandQueueHw<GfxFamily>::processDispatchForCacheFlush(Surface **surfaces,
|
|
size_t numSurfaces,
|
|
LinearStream *commandStream,
|
|
CsrDependencies &csrDeps) {
|
|
|
|
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDeps);
|
|
|
|
uint64_t postSyncAddress = 0;
|
|
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
|
auto timestampPacketNodeForPostSync = timestampPacketContainer->peekNodes().at(0);
|
|
timestampPacketNodeForPostSync->setProfilingCapable(false);
|
|
postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync);
|
|
}
|
|
|
|
submitCacheFlush(surfaces, numSurfaces, commandStream, postSyncAddress);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void CommandQueueHw<GfxFamily>::processDispatchForMarker(CommandQueue &commandQueue,
|
|
LinearStream *commandStream,
|
|
EventsRequest &eventsRequest,
|
|
CsrDependencies &csrDeps) {
|
|
auto event = castToObjectOrAbort<Event>(*eventsRequest.outEvent);
|
|
|
|
TagNodeBase *hwTimeStamps = nullptr;
|
|
TagNodeBase *hwPerfCounter = nullptr;
|
|
|
|
hwTimeStamps = event->getHwTimeStampNode();
|
|
|
|
HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
|
|
HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
|
|
getGpgpuCommandStreamReceiver().makeResident(*hwTimeStamps->getBaseGraphicsAllocation());
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void CommandQueueHw<GfxFamily>::processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue,
|
|
LinearStream *commandStream,
|
|
EventsRequest &eventsRequest,
|
|
CsrDependencies &csrDeps) {
|
|
auto currentTimestampPacketNode = commandQueue.getTimestampPacketContainer()->peekNodes().at(0);
|
|
|
|
auto timestampContextStartGpuAddress = TimestampPacketHelper::getContextStartGpuAddress(*currentTimestampPacketNode);
|
|
auto timestampGlobalStartAddress = TimestampPacketHelper::getGlobalStartGpuAddress(*currentTimestampPacketNode);
|
|
|
|
EncodeStoreMMIO<GfxFamily>::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextStartGpuAddress);
|
|
EncodeStoreMMIO<GfxFamily>::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalStartAddress);
|
|
|
|
auto timestampContextEndGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*currentTimestampPacketNode);
|
|
auto timestampGlobalEndAddress = TimestampPacketHelper::getGlobalEndGpuAddress(*currentTimestampPacketNode);
|
|
|
|
EncodeStoreMMIO<GfxFamily>::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextEndGpuAddress);
|
|
EncodeStoreMMIO<GfxFamily>::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalEndAddress);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void CommandQueueHw<GfxFamily>::obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType) {
|
|
auto isQueueBlockedStatus = isQueueBlocked();
|
|
taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
|
|
blockQueueStatus = (taskLevel == CompletionStamp::notReady) || isQueueBlockedStatus;
|
|
|
|
auto taskLevelUpdateRequired = isTaskLevelUpdateRequired(taskLevel, eventWaitList, numEventsInWaitList, commandType);
|
|
if (taskLevelUpdateRequired) {
|
|
taskLevel++;
|
|
this->taskLevel = taskLevel;
|
|
}
|
|
|
|
DBG_LOG(EventsDebugEnable, "blockQueue", blockQueueStatus, "virtualEvent", virtualEvent, "taskLevel", taskLevel);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
bool CommandQueueHw<GfxFamily>::isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType) {
|
|
bool updateTaskLevel = true;
|
|
//if we are blocked by user event then no update
|
|
if (taskLevel == CompletionStamp::notReady) {
|
|
updateTaskLevel = false;
|
|
}
|
|
//if we are executing command without kernel then it will inherit state from
|
|
//previous commands, barrier is exception
|
|
if (isCommandWithoutKernel(commandType) && commandType != CL_COMMAND_BARRIER) {
|
|
updateTaskLevel = false;
|
|
}
|
|
//ooq special cases starts here
|
|
if (this->isOOQEnabled()) {
|
|
//if no wait list and barrier , do not update task level
|
|
if (eventWaitList == nullptr && commandType != CL_COMMAND_BARRIER) {
|
|
updateTaskLevel = false;
|
|
}
|
|
//if we have waitlist then deduce task level from waitlist and check if it is higher then current task level of queue
|
|
if (eventWaitList != nullptr) {
|
|
auto taskLevelFromEvents = getTaskLevelFromWaitList(0, numEventsInWaitList, eventWaitList);
|
|
taskLevelFromEvents++;
|
|
if (taskLevelFromEvents <= this->taskLevel) {
|
|
updateTaskLevel = false;
|
|
}
|
|
}
|
|
}
|
|
return updateTaskLevel;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
template <uint32_t commandType>
|
|
CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
|
|
Surface **surfaces,
|
|
size_t surfaceCount,
|
|
LinearStream &commandStream,
|
|
size_t commandStreamStart,
|
|
bool &blocking,
|
|
bool clearDependenciesForSubCapture,
|
|
const MultiDispatchInfo &multiDispatchInfo,
|
|
const EnqueueProperties &enqueueProperties,
|
|
TimestampPacketDependencies ×tampPacketDependencies,
|
|
EventsRequest &eventsRequest,
|
|
EventBuilder &eventBuilder,
|
|
uint32_t taskLevel,
|
|
PrintfHandler *printfHandler,
|
|
CommandStreamReceiver *bcsCsr) {
|
|
|
|
UNRECOVERABLE_IF(multiDispatchInfo.empty());
|
|
|
|
auto implicitFlush = false;
|
|
|
|
if (printfHandler) {
|
|
blocking = true;
|
|
printfHandler->makeResident(getGpgpuCommandStreamReceiver());
|
|
}
|
|
|
|
if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
|
|
device->getDevice().syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver());
|
|
}
|
|
|
|
if (timestampPacketContainer) {
|
|
timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
|
|
timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
|
|
timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
|
|
}
|
|
|
|
bool anyUncacheableArgs = false;
|
|
auto requiresCoherency = false;
|
|
for (auto surface : CreateRange(surfaces, surfaceCount)) {
|
|
surface->makeResident(getGpgpuCommandStreamReceiver());
|
|
requiresCoherency |= surface->IsCoherent;
|
|
if (!surface->allowsL3Caching()) {
|
|
anyUncacheableArgs = true;
|
|
}
|
|
}
|
|
|
|
auto mediaSamplerRequired = false;
|
|
uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber;
|
|
auto specialPipelineSelectMode = false;
|
|
Kernel *kernel = nullptr;
|
|
bool auxTranslationRequired = false;
|
|
bool useGlobalAtomics = false;
|
|
|
|
for (auto &dispatchInfo : multiDispatchInfo) {
|
|
if (kernel != dispatchInfo.getKernel()) {
|
|
kernel = dispatchInfo.getKernel();
|
|
} else {
|
|
continue;
|
|
}
|
|
kernel->makeResident(getGpgpuCommandStreamReceiver());
|
|
requiresCoherency |= kernel->requiresCoherency();
|
|
mediaSamplerRequired |= kernel->isVmeKernel();
|
|
auto numGrfRequiredByKernel = static_cast<uint32_t>(kernel->getKernelInfo().kernelDescriptor.kernelAttributes.numGrfRequired);
|
|
numGrfRequired = std::max(numGrfRequired, numGrfRequiredByKernel);
|
|
specialPipelineSelectMode |= kernel->requiresSpecialPipelineSelectMode();
|
|
auxTranslationRequired |= kernel->isAuxTranslationRequired();
|
|
if (kernel->hasUncacheableStatelessArgs()) {
|
|
anyUncacheableArgs = true;
|
|
}
|
|
|
|
if (kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics) {
|
|
useGlobalAtomics = true;
|
|
}
|
|
}
|
|
|
|
if (mediaSamplerRequired) {
|
|
DEBUG_BREAK_IF(device->getDeviceInfo().preemptionSupported != false);
|
|
}
|
|
|
|
if (isProfilingEnabled() && eventBuilder.getEvent()) {
|
|
eventBuilder.getEvent()->setSubmitTimeStamp();
|
|
|
|
auto hwTimestampNode = eventBuilder.getEvent()->getHwTimeStampNode();
|
|
if (hwTimestampNode) {
|
|
getGpgpuCommandStreamReceiver().makeResident(*hwTimestampNode->getBaseGraphicsAllocation());
|
|
}
|
|
|
|
if (isPerfCountersEnabled()) {
|
|
getGpgpuCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwPerfCounterNode()->getBaseGraphicsAllocation());
|
|
}
|
|
}
|
|
|
|
IndirectHeap *dsh = nullptr;
|
|
IndirectHeap *ioh = nullptr;
|
|
|
|
dsh = &getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 0u);
|
|
ioh = &getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 0u);
|
|
|
|
auto allocNeedsFlushDC = false;
|
|
if (!device->isFullRangeSvm()) {
|
|
if (std::any_of(getGpgpuCommandStreamReceiver().getResidencyAllocations().begin(), getGpgpuCommandStreamReceiver().getResidencyAllocations().end(), [](const auto allocation) { return allocation->isFlushL3Required(); })) {
|
|
allocNeedsFlushDC = true;
|
|
}
|
|
}
|
|
|
|
auto memoryCompressionState = getGpgpuCommandStreamReceiver().getMemoryCompressionState(auxTranslationRequired, device->getHardwareInfo());
|
|
|
|
DispatchFlags dispatchFlags(
|
|
{}, //csrDependencies
|
|
×tampPacketDependencies.barrierNodes, //barrierTimestampPacketNodes
|
|
{}, //pipelineSelectArgs
|
|
this->flushStamp->getStampReference(), //flushStampReference
|
|
getThrottle(), //throttle
|
|
ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo), //preemptionMode
|
|
numGrfRequired, //numGrfRequired
|
|
L3CachingSettings::l3CacheOn, //l3CacheSettings
|
|
kernel->getThreadArbitrationPolicy(), //threadArbitrationPolicy
|
|
kernel->getAdditionalKernelExecInfo(), //additionalKernelExecInfo
|
|
kernel->getExecutionType(), //kernelExecutionType
|
|
memoryCompressionState, //memoryCompressionState
|
|
getSliceCount(), //sliceCount
|
|
blocking, //blocking
|
|
shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC, //dcFlush
|
|
multiDispatchInfo.usesSlm(), //useSLM
|
|
true, //guardCommandBufferWithPipeControl
|
|
commandType == CL_COMMAND_NDRANGE_KERNEL, //GSBA32BitRequired
|
|
requiresCoherency, //requiresCoherency
|
|
(QueuePriority::LOW == priority), //lowPriority
|
|
implicitFlush, //implicitFlush
|
|
!eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
|
|
false, //epilogueRequired
|
|
false, //usePerDssBackedBuffer
|
|
kernel->isSingleSubdevicePreferred(), //useSingleSubdevice
|
|
useGlobalAtomics, //useGlobalAtomics
|
|
kernel->areMultipleSubDevicesInContext(), //areMultipleSubDevicesInContext
|
|
kernel->requiresMemoryMigration(), //memoryMigrationRequired
|
|
isTextureCacheFlushNeeded(commandType)); //textureCacheFlush
|
|
|
|
dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired;
|
|
dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode;
|
|
|
|
dispatchFlags.disableEUFusion = kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion;
|
|
|
|
const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
|
|
|
|
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() && !clearDependenciesForSubCapture) {
|
|
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
|
|
if (isHandlingBarrier) {
|
|
fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
|
|
}
|
|
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
|
|
}
|
|
|
|
DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady);
|
|
|
|
if (anyUncacheableArgs) {
|
|
dispatchFlags.l3CacheSettings = L3CachingSettings::l3CacheOff;
|
|
} else if (!kernel->areStatelessWritesUsed()) {
|
|
dispatchFlags.l3CacheSettings = L3CachingSettings::l3AndL1On;
|
|
}
|
|
|
|
if (this->dispatchHints != 0) {
|
|
dispatchFlags.engineHints = this->dispatchHints;
|
|
dispatchFlags.epilogueRequired = true;
|
|
}
|
|
|
|
if (gtpinIsGTPinInitialized()) {
|
|
gtpinNotifyPreFlushTask(this);
|
|
}
|
|
|
|
if (enqueueProperties.blitPropertiesContainer->size() > 0) {
|
|
const auto newTaskCount = bcsCsr->flushBcsTask(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
|
|
this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
|
|
dispatchFlags.implicitFlush = true;
|
|
}
|
|
|
|
PRINT_DEBUG_STRING(DebugManager.flags.PrintDebugMessages.get(), stdout, "preemption = %d.\n", static_cast<int>(dispatchFlags.preemptionMode));
|
|
CompletionStamp completionStamp = getGpgpuCommandStreamReceiver().flushTask(
|
|
commandStream,
|
|
commandStreamStart,
|
|
*dsh,
|
|
*ioh,
|
|
getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 0u),
|
|
taskLevel,
|
|
dispatchFlags,
|
|
getDevice());
|
|
|
|
if (isHandlingBarrier) {
|
|
clearLastBcsPackets();
|
|
}
|
|
|
|
if (gtpinIsGTPinInitialized()) {
|
|
gtpinNotifyFlushTask(completionStamp.taskCount);
|
|
}
|
|
|
|
return completionStamp;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void CommandQueueHw<GfxFamily>::enqueueBlocked(
|
|
uint32_t commandType,
|
|
Surface **surfaces,
|
|
size_t surfaceCount,
|
|
const MultiDispatchInfo &multiDispatchInfo,
|
|
TimestampPacketDependencies ×tampPacketDependencies,
|
|
std::unique_ptr<KernelOperation> &blockedCommandsData,
|
|
const EnqueueProperties &enqueueProperties,
|
|
EventsRequest &eventsRequest,
|
|
EventBuilder &externalEventBuilder,
|
|
std::unique_ptr<PrintfHandler> &&printfHandler,
|
|
CommandStreamReceiver *bcsCsr) {
|
|
|
|
TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
|
|
|
|
//store previous virtual event as it will add dependecies to new virtual event
|
|
if (this->virtualEvent) {
|
|
DBG_LOG(EventsDebugEnable, "enqueueBlocked", "previousVirtualEvent", this->virtualEvent);
|
|
}
|
|
|
|
EventBuilder internalEventBuilder;
|
|
EventBuilder *eventBuilder;
|
|
// check if event will be exposed externally
|
|
if (externalEventBuilder.getEvent()) {
|
|
externalEventBuilder.getEvent()->incRefInternal();
|
|
eventBuilder = &externalEventBuilder;
|
|
DBG_LOG(EventsDebugEnable, "enqueueBlocked", "output event as virtualEvent", virtualEvent);
|
|
} else {
|
|
// it will be an internal event
|
|
internalEventBuilder.create<VirtualEvent>(this, context);
|
|
eventBuilder = &internalEventBuilder;
|
|
DBG_LOG(EventsDebugEnable, "enqueueBlocked", "new virtualEvent", eventBuilder->getEvent());
|
|
}
|
|
auto outEvent = eventBuilder->getEvent();
|
|
|
|
//update queue taskCount
|
|
taskCount = outEvent->getCompletionStamp();
|
|
|
|
std::unique_ptr<Command> command;
|
|
bool storeTimestampPackets = false;
|
|
|
|
if (blockedCommandsData) {
|
|
if (enqueueProperties.blitPropertiesContainer) {
|
|
blockedCommandsData->blitPropertiesContainer = *enqueueProperties.blitPropertiesContainer;
|
|
blockedCommandsData->bcsCsr = bcsCsr;
|
|
blockedCommandsData->blitEnqueue = true;
|
|
}
|
|
|
|
storeTimestampPackets = (timestampPacketContainer != nullptr);
|
|
}
|
|
|
|
if (enqueueProperties.operation != EnqueueProperties::Operation::GpuKernel) {
|
|
command = std::make_unique<CommandWithoutKernel>(*this, blockedCommandsData);
|
|
} else {
|
|
//store task data in event
|
|
std::vector<Surface *> allSurfaces;
|
|
Kernel *kernel = nullptr;
|
|
for (auto &dispatchInfo : multiDispatchInfo) {
|
|
if (kernel != dispatchInfo.getKernel()) {
|
|
kernel = dispatchInfo.getKernel();
|
|
} else {
|
|
continue;
|
|
}
|
|
kernel->getResidency(allSurfaces);
|
|
}
|
|
|
|
allSurfaces.reserve(allSurfaces.size() + surfaceCount);
|
|
for (auto &surface : CreateRange(surfaces, surfaceCount)) {
|
|
allSurfaces.push_back(surface->duplicate());
|
|
}
|
|
|
|
PreemptionMode preemptionMode = ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo);
|
|
bool slmUsed = multiDispatchInfo.usesSlm();
|
|
command = std::make_unique<CommandComputeKernel>(*this,
|
|
blockedCommandsData,
|
|
std::move(allSurfaces),
|
|
shouldFlushDC(commandType, printfHandler.get()),
|
|
slmUsed,
|
|
commandType,
|
|
std::move(printfHandler),
|
|
preemptionMode,
|
|
multiDispatchInfo.peekMainKernel(),
|
|
(uint32_t)multiDispatchInfo.size());
|
|
}
|
|
if (storeTimestampPackets) {
|
|
command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies));
|
|
command->setEventsRequest(eventsRequest);
|
|
} else if (this->context->getRootDeviceIndices().size() > 1) {
|
|
command->setEventsRequest(eventsRequest);
|
|
}
|
|
|
|
outEvent->setCommand(std::move(command));
|
|
|
|
eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventsRequest.eventWaitList, eventsRequest.numEventsInWaitList));
|
|
eventBuilder->addParentEvent(this->virtualEvent);
|
|
eventBuilder->finalize();
|
|
|
|
if (this->virtualEvent) {
|
|
this->virtualEvent->decRefInternal();
|
|
}
|
|
|
|
this->virtualEvent = outEvent;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
|
|
Surface **surfaces,
|
|
size_t surfaceCount,
|
|
LinearStream *commandStream,
|
|
size_t commandStreamStart,
|
|
bool &blocking,
|
|
const EnqueueProperties &enqueueProperties,
|
|
TimestampPacketDependencies ×tampPacketDependencies,
|
|
EventsRequest &eventsRequest,
|
|
EventBuilder &eventBuilder,
|
|
uint32_t taskLevel,
|
|
CsrDependencies &csrDeps,
|
|
CommandStreamReceiver *bcsCsr) {
|
|
|
|
CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
|
|
bool flushGpgpuCsr = true;
|
|
|
|
if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired(false, timestampPacketDependencies)) {
|
|
flushGpgpuCsr = false;
|
|
} else {
|
|
csrDeps.makeResident(getGpgpuCommandStreamReceiver());
|
|
}
|
|
|
|
if (eventBuilder.getEvent() && isProfilingEnabled()) {
|
|
eventBuilder.getEvent()->setSubmitTimeStamp();
|
|
eventBuilder.getEvent()->setStartTimeStamp();
|
|
}
|
|
|
|
if (flushGpgpuCsr) {
|
|
if (timestampPacketContainer) {
|
|
timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
|
|
timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
|
|
timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
|
|
}
|
|
|
|
for (auto surface : CreateRange(surfaces, surfaceCount)) {
|
|
surface->makeResident(getGpgpuCommandStreamReceiver());
|
|
}
|
|
|
|
auto rootDeviceIndex = getDevice().getRootDeviceIndex();
|
|
DispatchFlags dispatchFlags(
|
|
{}, //csrDependencies
|
|
×tampPacketDependencies.barrierNodes, //barrierTimestampPacketNodes
|
|
{}, //pipelineSelectArgs
|
|
flushStamp->getStampReference(), //flushStampReference
|
|
getThrottle(), //throttle
|
|
device->getPreemptionMode(), //preemptionMode
|
|
GrfConfig::NotApplicable, //numGrfRequired
|
|
L3CachingSettings::NotApplicable, //l3CacheSettings
|
|
ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy
|
|
AdditionalKernelExecInfo::NotApplicable, //additionalKernelExecInfo
|
|
KernelExecutionType::NotApplicable, //kernelExecutionType
|
|
MemoryCompressionState::NotApplicable, //memoryCompressionState
|
|
getSliceCount(), //sliceCount
|
|
blocking, //blocking
|
|
false, //dcFlush
|
|
false, //useSLM
|
|
true, //guardCommandBufferWithPipeControl
|
|
false, //GSBA32BitRequired
|
|
false, //requiresCoherency
|
|
false, //lowPriority
|
|
(enqueueProperties.operation == EnqueueProperties::Operation::Blit), //implicitFlush
|
|
getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
|
|
false, //epilogueRequired
|
|
false, //usePerDssBackedBuffer
|
|
false, //useSingleSubdevice
|
|
false, //useGlobalAtomics
|
|
context->containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext
|
|
false, //memoryMigrationRequired
|
|
false); //textureCacheFlush
|
|
|
|
const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
|
|
|
|
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
|
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
|
|
if (isHandlingBarrier) {
|
|
fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
|
|
}
|
|
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
|
|
}
|
|
|
|
completionStamp = getGpgpuCommandStreamReceiver().flushTask(
|
|
*commandStream,
|
|
commandStreamStart,
|
|
getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 0u),
|
|
getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 0u),
|
|
getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 0u),
|
|
taskLevel,
|
|
dispatchFlags,
|
|
getDevice());
|
|
|
|
if (isHandlingBarrier) {
|
|
clearLastBcsPackets();
|
|
}
|
|
}
|
|
|
|
if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
|
|
UNRECOVERABLE_IF(!enqueueProperties.blitPropertiesContainer);
|
|
const auto newTaskCount = bcsCsr->flushBcsTask(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
|
|
this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
|
|
}
|
|
|
|
return completionStamp;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void CommandQueueHw<GfxFamily>::computeOffsetsValueForRectCommands(size_t *bufferOffset,
|
|
size_t *hostOffset,
|
|
const size_t *bufferOrigin,
|
|
const size_t *hostOrigin,
|
|
const size_t *region,
|
|
size_t bufferRowPitch,
|
|
size_t bufferSlicePitch,
|
|
size_t hostRowPitch,
|
|
size_t hostSlicePitch) {
|
|
size_t computedBufferRowPitch = bufferRowPitch ? bufferRowPitch : region[0];
|
|
size_t computedBufferSlicePitch = bufferSlicePitch ? bufferSlicePitch : region[1] * computedBufferRowPitch;
|
|
size_t computedHostRowPitch = hostRowPitch ? hostRowPitch : region[0];
|
|
size_t computedHostSlicePitch = hostSlicePitch ? hostSlicePitch : region[1] * computedHostRowPitch;
|
|
*bufferOffset = bufferOrigin[2] * computedBufferSlicePitch + bufferOrigin[1] * computedBufferRowPitch + bufferOrigin[0];
|
|
*hostOffset = hostOrigin[2] * computedHostSlicePitch + hostOrigin[1] * computedHostRowPitch + hostOrigin[0];
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
size_t CommandQueueHw<GfxFamily>::calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image) {
|
|
auto bytesPerPixel = image->getSurfaceFormatInfo().surfaceFormat.ImageElementSizeInBytes;
|
|
auto dstRowPitch = rowPitch ? rowPitch : region[0] * bytesPerPixel;
|
|
auto dstSlicePitch = slicePitch ? slicePitch : ((image->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 1 : region[1]) * dstRowPitch);
|
|
|
|
return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
template <uint32_t cmdType>
|
|
void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr) {
|
|
auto bcsCommandStreamReceiverOwnership = bcsCsr.obtainUniqueOwnership();
|
|
std::unique_lock<NEO::CommandStreamReceiver::MutexType> commandStreamReceiverOwnership;
|
|
|
|
EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
|
|
EventBuilder eventBuilder;
|
|
|
|
setupEvent(eventBuilder, eventsRequest.outEvent, cmdType);
|
|
eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);
|
|
|
|
std::unique_ptr<KernelOperation> blockedCommandsData;
|
|
TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
|
|
|
|
auto blockQueue = false;
|
|
auto taskLevel = 0u;
|
|
obtainTaskLevelAndBlockedStatus(taskLevel, eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, blockQueue, cmdType);
|
|
auto clearAllDependencies = queueDependenciesClearRequired();
|
|
|
|
enqueueHandlerHook(cmdType, multiDispatchInfo);
|
|
aubCaptureHook(blocking, clearAllDependencies, multiDispatchInfo);
|
|
|
|
if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
|
|
blocking = true;
|
|
}
|
|
|
|
TimestampPacketDependencies timestampPacketDependencies;
|
|
BlitPropertiesContainer blitPropertiesContainer;
|
|
CsrDependencies csrDeps;
|
|
|
|
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
|
|
auto allocator = bcsCsr.getTimestampPacketAllocator();
|
|
|
|
if (!blockQueue) {
|
|
setupBarrierTimestampForBcsEngines(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);
|
|
if (isOOQEnabled()) {
|
|
TimestampPacketContainer clearBarrierNodes;
|
|
timestampPacketDependencies.barrierNodes.swapNodes(clearBarrierNodes);
|
|
}
|
|
}
|
|
processBarrierTimestampForBcsEngine(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);
|
|
|
|
auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies);
|
|
if (isCacheFlushForBcsRequired() && gpgpuSubmission) {
|
|
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
|
|
}
|
|
|
|
obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
|
|
csrDeps.timestampPacketContainer.push_back(×tampPacketDependencies.previousEnqueueNodes);
|
|
|
|
if (eventBuilder.getEvent()) {
|
|
eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
|
|
}
|
|
|
|
CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
|
|
|
|
const EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);
|
|
|
|
LinearStream *gpgpuCommandStream = {};
|
|
size_t gpgpuCommandStreamStart = {};
|
|
if (gpgpuSubmission) {
|
|
commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
|
|
gpgpuCommandStream = obtainCommandStream<cmdType>(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false);
|
|
gpgpuCommandStreamStart = gpgpuCommandStream->getUsed();
|
|
}
|
|
|
|
blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
|
|
eventsRequest, gpgpuCommandStream, cmdType, blockQueue));
|
|
|
|
if (!blockQueue) {
|
|
completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
|
|
enqueueProperties, timestampPacketDependencies, eventsRequest,
|
|
eventBuilder, taskLevel, csrDeps, &bcsCsr);
|
|
if (gpgpuSubmission) {
|
|
commandStreamReceiverOwnership.unlock();
|
|
}
|
|
|
|
if (eventBuilder.getEvent()) {
|
|
eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
|
|
}
|
|
|
|
this->latestSentEnqueueType = enqueueProperties.operation;
|
|
|
|
setLastBcsPacket(bcsCsr.getOsContext().getEngineType());
|
|
}
|
|
updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
|
|
|
|
if (blockQueue) {
|
|
enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr);
|
|
}
|
|
|
|
timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
|
|
|
|
queueOwnership.unlock();
|
|
bcsCommandStreamReceiverOwnership.unlock();
|
|
|
|
if (blocking) {
|
|
waitForAllEngines(blockQueue, nullptr);
|
|
}
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
template <uint32_t cmdType, size_t surfaceCount>
|
|
void CommandQueueHw<GfxFamily>::dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &dispatchInfo, Surface *(&surfaces)[surfaceCount], EBuiltInOps::Type builtInOperation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr) {
|
|
const bool blit = EngineHelpers::isBcs(csr.getOsContext().getEngineType());
|
|
|
|
if (blit) {
|
|
enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
|
|
} else {
|
|
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInOperation,
|
|
this->getClDevice());
|
|
BuiltInOwnershipWrapper builtInLock(builder, this->context);
|
|
|
|
builder.buildDispatchInfos(dispatchInfo);
|
|
|
|
enqueueHandler<cmdType>(
|
|
surfaces,
|
|
blocking,
|
|
dispatchInfo,
|
|
numEventsInWaitList,
|
|
eventWaitList,
|
|
event);
|
|
}
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
bool CommandQueueHw<GfxFamily>::isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo) {
|
|
return multiDispatchInfo.getKernelObjsForAuxTranslation() &&
|
|
(multiDispatchInfo.getKernelObjsForAuxTranslation()->size() > 0) &&
|
|
(HwHelperHw<GfxFamily>::get().getAuxTranslationMode(device->getHardwareInfo()) == AuxTranslationMode::Blit);
|
|
}
|
|
|
|
} // namespace NEO
|