// compute-runtime/opencl/source/helpers/task_information.cpp

/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "opencl/source/helpers/task_information.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/command_stream/csr_deps.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/string.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/surface.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/command_queue/enqueue_common.h"
#include "opencl/source/device_queue/device_queue.h"
#include "opencl/source/gtpin/gtpin_notify.h"
#include "opencl/source/helpers/cl_preemption_helper.h"
#include "opencl/source/helpers/enqueue_properties.h"
#include "opencl/source/helpers/task_information.inl"
#include "opencl/source/mem_obj/mem_obj.h"
namespace NEO {
template void KernelOperation::ResourceCleaner::operator()<LinearStream>(LinearStream *);
template void KernelOperation::ResourceCleaner::operator()<IndirectHeap>(IndirectHeap *);
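
// The Command subclasses below capture everything needed to replay a blocked
// enqueue once its wait list completes. A typical flow, as a simplified sketch
// (the exact call sequence lives in enqueue_common.h and is hedged here, not a
// verbatim quote of that code):
//
//   auto cmd = std::make_unique<CommandComputeKernel>(commandQueue, kernelOperation,
//                                                     allSurfaces, flushDC, slmUsed,
//                                                     ndRangeKernel, std::move(printfHandler),
//                                                     preemptionMode, kernel, kernelCount);
//   cmd->setEventsRequest(eventsRequest);
//   event->setCommand(std::move(cmd)); // the Event calls submit() when it unblocks
//
// CommandMapUnmap records a deferred map/unmap of a MemObj; it holds an internal
// reference on the MemObj for the lifetime of the command.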
CommandMapUnmap::CommandMapUnmap(MapOperationType operationType, MemObj &memObj, MemObjSizeArray &copySize, MemObjOffsetArray &copyOffset, bool readOnly,
                                 CommandQueue &commandQueue)
    : Command(commandQueue), memObj(memObj), copySize(copySize), copyOffset(copyOffset), readOnly(readOnly), operationType(operationType) {
    memObj.incRefInternal();
}
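
// On submit, flush the queue's command stream, then, for non-zero-copy objects,
// wait for completion and copy data between the allocation and the host pointer
// in the direction the map/unmap requires.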
CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
if (terminated) {
memObj.decRefInternal();
return completionStamp;
}
auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
auto commandStreamReceiverOwnership = commandStreamReceiver.obtainUniqueOwnership();
auto &queueCommandStream = commandQueue.getCS(0);
size_t offset = queueCommandStream.getUsed();
MultiDispatchInfo multiDispatch;
Device &device = commandQueue.getDevice();
    DispatchFlags dispatchFlags(
        {},                                                                          //csrDependencies
        nullptr,                                                                     //barrierTimestampPacketNodes
        {},                                                                          //pipelineSelectArgs
        commandQueue.flushStamp->getStampReference(),                                //flushStampReference
        commandQueue.getThrottle(),                                                  //throttle
        ClPreemptionHelper::taskPreemptionMode(device, multiDispatch),               //preemptionMode
        GrfConfig::NotApplicable,                                                    //numGrfRequired
        L3CachingSettings::NotApplicable,                                            //l3CacheSettings
        ThreadArbitrationPolicy::NotPresent,                                         //threadArbitrationPolicy
        AdditionalKernelExecInfo::NotApplicable,                                     //additionalKernelExecInfo
        KernelExecutionType::NotApplicable,                                          //kernelExecutionType
        MemoryCompressionState::NotApplicable,                                       //memoryCompressionState
        commandQueue.getSliceCount(),                                                //sliceCount
        true,                                                                        //blocking
        true,                                                                        //dcFlush
        false,                                                                       //useSLM
        true,                                                                        //guardCommandBufferWithPipeControl
        false,                                                                       //GSBA32BitRequired
        false,                                                                       //requiresCoherency
        commandQueue.getPriority() == QueuePriority::LOW,                            //lowPriority
        false,                                                                       //implicitFlush
        commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
        false,                                                                       //epilogueRequired
        false,                                                                       //usePerDssBackedBuffer
        false,                                                                       //useSingleSubdevice
        false,                                                                       //useGlobalAtomics
        false,                                                                       //areMultipleSubDevicesInContext
        false,                                                                       //memoryMigrationRequired
        false);                                                                      //textureCacheFlush
    DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady);

    gtpinNotifyPreFlushTask(&commandQueue);

    completionStamp = commandStreamReceiver.flushTask(queueCommandStream,
                                                      offset,
                                                      commandQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
                                                      commandQueue.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
                                                      commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
                                                      taskLevel,
                                                      dispatchFlags,
                                                      commandQueue.getDevice());

    commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::DependencyResolveOnGpu);

    if (!memObj.isMemObjZeroCopy()) {
        commandQueue.waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
        if (operationType == MAP) {
            memObj.transferDataToHostPtr(copySize, copyOffset);
        } else if (!readOnly) {
            DEBUG_BREAK_IF(operationType != UNMAP);
            memObj.transferDataFromHostPtr(copySize, copyOffset);
        }
    }

    memObj.decRefInternal();

    return completionStamp;
}
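
// CommandComputeKernel owns everything a blocked kernel launch needs to be
// replayed later: the recorded command stream and heaps (via KernelOperation),
// the surfaces to make resident, an optional printf handler, and an internal
// reference on the kernel itself.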
CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces,
                                           bool flushDC, bool usesSLM, bool ndRangeKernel, std::unique_ptr<PrintfHandler> &&printfHandler,
                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount)
    : Command(commandQueue, kernelOperation), flushDC(flushDC), slmUsed(usesSLM),
      NDRangeKernel(ndRangeKernel), printfHandler(std::move(printfHandler)), kernel(kernel),
      kernelCount(kernelCount), preemptionMode(preemptionMode) {
    for (auto surface : surfaces) {
        this->surfaces.push_back(surface);
    }
    UNRECOVERABLE_IF(nullptr == this->kernel);
    kernel->incRefInternal();
}

CommandComputeKernel::~CommandComputeKernel() {
    kernel->decRefInternal();
}
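
// Submission replays the recorded dispatch: make all resources resident, set up
// the execution-model scheduler when the kernel is a parent kernel, rebuild
// DispatchFlags from the kernel descriptor, flush, and finally run any blit
// operations queued for aux translation.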
CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminated) {
if (terminated) {
for (auto surface : surfaces) {
delete surface;
}
surfaces.clear();
return completionStamp;
}
auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
bool executionModelKernel = kernel->isParentKernel;
auto devQueue = commandQueue.getContext().getDefaultDeviceQueue();
auto bcsCsrForAuxTranslation = commandQueue.getBcsForAuxTranslation();
auto commandStreamReceiverOwnership = commandStreamReceiver.obtainUniqueOwnership();
bool isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
if (executionModelKernel) {
while (!devQueue->isEMCriticalSectionFree())
;
devQueue->resetDeviceQueue();
devQueue->acquireEMCriticalSection();
}
IndirectHeap *dsh = kernelOperation->dsh.get();
IndirectHeap *ioh = kernelOperation->ioh.get();
IndirectHeap *ssh = kernelOperation->ssh.get();
auto requiresCoherency = false;
auto anyUncacheableArgs = false;
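
    // Make every argument surface resident and accumulate the coherency and
    // L3-cacheability requirements that feed into DispatchFlags below.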
for (auto &surface : surfaces) {
DEBUG_BREAK_IF(!surface);
surface->makeResident(commandStreamReceiver);
requiresCoherency |= surface->IsCoherent;
if (!surface->allowsL3Caching()) {
anyUncacheableArgs = true;
}
}
if (printfHandler) {
printfHandler.get()->makeResident(commandStreamReceiver);
}
makeTimestampPacketsResident(commandStreamReceiver);
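
    // Device-side enqueue (parent kernels): hand the recorded work to the
    // scheduler kernel on the default device queue before flushing.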
    if (executionModelKernel) {
        uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
        devQueue->setupExecutionModelDispatch(*ssh, *dsh, kernel, kernelCount,
                                              commandStreamReceiver.getTagAllocation()->getGpuAddress(), taskCount, timestamp, isCcsUsed);

        SchedulerKernel &scheduler = commandQueue.getContext().getSchedulerKernel();

        scheduler.setArgs(devQueue->getQueueBuffer(),
                          devQueue->getStackBuffer(),
                          devQueue->getEventPoolBuffer(),
                          devQueue->getSlbBuffer(),
                          dsh->getGraphicsAllocation(),
                          kernel->getKernelReflectionSurface(),
                          devQueue->getQueueStorageBuffer(),
                          ssh->getGraphicsAllocation(),
                          devQueue->getDebugQueue());

        devQueue->dispatchScheduler(
            *kernelOperation->commandStream,
            scheduler,
            preemptionMode,
            ssh,
            dsh,
            isCcsUsed);

        scheduler.makeResident(commandStreamReceiver);

        // Update SLM usage
        slmUsed |= scheduler.getSlmTotalSize() > 0;

        this->kernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(commandStreamReceiver);
    }

    if (kernelOperation->blitPropertiesContainer.size() > 0) {
        CsrDependencies csrDeps;
        eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, *bcsCsrForAuxTranslation, CsrDependencies::DependenciesType::All);
        BlitProperties::setupDependenciesForAuxTranslation(kernelOperation->blitPropertiesContainer, *timestampPacketDependencies,
                                                           *currentTimestampPacketNodes, csrDeps,
                                                           commandQueue.getGpgpuCommandStreamReceiver(), *bcsCsrForAuxTranslation);
    }

    if (timestampPacketDependencies && commandQueue.isOOQEnabled()) {
        commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies);
    }
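
    // DispatchFlags are rebuilt from the kernel descriptor at submit time so
    // they reflect the state captured when the enqueue was recorded.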
const auto &kernelDescriptor = kernel->getKernelInfo().kernelDescriptor;
auto memoryCompressionState = commandStreamReceiver.getMemoryCompressionState(kernel->isAuxTranslationRequired(), commandQueue.getDevice().getHardwareInfo());
DispatchFlags dispatchFlags(
{}, //csrDependencies
nullptr, //barrierTimestampPacketNodes
{false, kernel->isVmeKernel()}, //pipelineSelectArgs
commandQueue.flushStamp->getStampReference(), //flushStampReference
commandQueue.getThrottle(), //throttle
preemptionMode, //preemptionMode
kernelDescriptor.kernelAttributes.numGrfRequired, //numGrfRequired
L3CachingSettings::l3CacheOn, //l3CacheSettings
kernel->getThreadArbitrationPolicy(), //threadArbitrationPolicy
kernel->getAdditionalKernelExecInfo(), //additionalKernelExecInfo
kernel->getExecutionType(), //kernelExecutionType
memoryCompressionState, //memoryCompressionState
commandQueue.getSliceCount(), //sliceCount
true, //blocking
flushDC, //dcFlush
slmUsed, //useSLM
true, //guardCommandBufferWithPipeControl
NDRangeKernel, //GSBA32BitRequired
requiresCoherency, //requiresCoherency
commandQueue.getPriority() == QueuePriority::LOW, //lowPriority
false, //implicitFlush
commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
false, //epilogueRequired
kernel->requiresPerDssBackedBuffer(), //usePerDssBackedBuffer
kernel->isSingleSubdevicePreferred(), //useSingleSubdevice
kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, //useGlobalAtomics
kernel->areMultipleSubDevicesInContext(), //areMultipleSubDevicesInContext
kernel->requiresMemoryMigration(), //memoryMigrationRequired
false); //textureCacheFlush
    if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
    }

    const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();

    if (timestampPacketDependencies) {
        eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
        if (isHandlingBarrier) {
            commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
        }
        dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies->barrierNodes;
    }

    dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = kernel->requiresSpecialPipelineSelectMode();

    if (anyUncacheableArgs) {
        dispatchFlags.l3CacheSettings = L3CachingSettings::l3CacheOff;
    } else if (!kernel->areStatelessWritesUsed()) {
        dispatchFlags.l3CacheSettings = L3CachingSettings::l3AndL1On;
    }

    if (commandQueue.dispatchHints != 0) {
        dispatchFlags.engineHints = commandQueue.dispatchHints;
        dispatchFlags.epilogueRequired = true;
    }

    DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady);

    gtpinNotifyPreFlushTask(&commandQueue);

    if (kernel->requiresMemoryMigration()) {
        for (auto &arg : kernel->getMemObjectsToMigrate()) {
            MigrationController::handleMigration(commandQueue.getContext(), commandStreamReceiver, arg.second);
        }
    }
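
    // The command stream and heaps recorded in KernelOperation are dedicated to
    // this enqueue, so flushTask consumes them from offset 0.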
    completionStamp = commandStreamReceiver.flushTask(*kernelOperation->commandStream,
                                                      0,
                                                      *dsh,
                                                      *ioh,
                                                      *ssh,
                                                      taskLevel,
                                                      dispatchFlags,
                                                      commandQueue.getDevice());

    if (isHandlingBarrier) {
        commandQueue.clearLastBcsPackets();
    }

    if (kernelOperation->blitPropertiesContainer.size() > 0) {
        const auto newTaskCount = bcsCsrForAuxTranslation->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
        commandQueue.updateBcsTaskCount(bcsCsrForAuxTranslation->getOsContext().getEngineType(), newTaskCount);
    }

    commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::GpuKernel);

    if (gtpinIsGTPinInitialized()) {
        gtpinNotifyFlushTask(completionStamp.taskCount);
    }

    if (printfHandler) {
        commandQueue.waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
        printfHandler->printEnqueueOutput();
    }

    for (auto surface : surfaces) {
        delete surface;
    }
    surfaces.clear();

    return completionStamp;
}
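
// CommandWithoutKernel covers blocked markers, barriers, and blit enqueues.
// dispatchBlitOperation forwards the single recorded BlitProperties entry to
// the BCS, wiring in timestamp-packet and cross-device dependencies first.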
void CommandWithoutKernel::dispatchBlitOperation() {
    auto bcsCsr = kernelOperation->bcsCsr;
    UNRECOVERABLE_IF(bcsCsr == nullptr);
    UNRECOVERABLE_IF(kernelOperation->blitPropertiesContainer.size() != 1);

    auto &blitProperties = *kernelOperation->blitPropertiesContainer.begin();
    eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, *bcsCsr, CsrDependencies::DependenciesType::All);
    blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->cacheFlushNodes);
    blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->previousEnqueueNodes);
    blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->barrierNodes);
    blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];

    if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
        eventsRequest.fillCsrDependenciesForTaskCountContainer(blitProperties.csrDependencies, *bcsCsr);
    }

    const auto newTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
    commandQueue.updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
    commandQueue.setLastBcsPacket(bcsCsr->getOsContext().getEngineType());
}
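
// Without a KernelOperation there is nothing to flush: report the CSR's
// current task count and flush stamp so waiters can complete immediately.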
CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminated) {
if (terminated) {
return completionStamp;
}
auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
if (!kernelOperation) {
completionStamp.taskCount = commandStreamReceiver.peekTaskCount();
completionStamp.taskLevel = commandStreamReceiver.peekTaskLevel();
completionStamp.flushStamp = commandStreamReceiver.obtainCurrentFlushStamp();
return completionStamp;
}
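
    // Blit enqueues add a barrier timestamp node when the CSR has stalling
    // commands pending, so the copy cannot overtake them on the GPGPU engine.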
    auto barrierNodes = timestampPacketDependencies ? &timestampPacketDependencies->barrierNodes : nullptr;

    auto lockCSR = commandStreamReceiver.obtainUniqueOwnership();

    auto enqueueOperationType = EnqueueProperties::Operation::DependencyResolveOnGpu;

    if (kernelOperation->blitEnqueue) {
        enqueueOperationType = EnqueueProperties::Operation::Blit;

        UNRECOVERABLE_IF(!barrierNodes);
        if (commandStreamReceiver.isStallingCommandsOnNextFlushRequired()) {
            barrierNodes->add(commandStreamReceiver.getTimestampPacketAllocator()->getTag());
        }
    }

    if (timestampPacketDependencies && commandQueue.isOOQEnabled()) {
        commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies);
    }

    auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex();
    DispatchFlags dispatchFlags(
        {},                                                                     //csrDependencies
        barrierNodes,                                                           //barrierTimestampPacketNodes
        {},                                                                     //pipelineSelectArgs
        commandQueue.flushStamp->getStampReference(),                           //flushStampReference
        commandQueue.getThrottle(),                                             //throttle
        commandQueue.getDevice().getPreemptionMode(),                           //preemptionMode
        GrfConfig::NotApplicable,                                               //numGrfRequired
        L3CachingSettings::NotApplicable,                                       //l3CacheSettings
        ThreadArbitrationPolicy::NotPresent,                                    //threadArbitrationPolicy
        AdditionalKernelExecInfo::NotApplicable,                                //additionalKernelExecInfo
        KernelExecutionType::NotApplicable,                                     //kernelExecutionType
        MemoryCompressionState::NotApplicable,                                  //memoryCompressionState
        commandQueue.getSliceCount(),                                           //sliceCount
        true,                                                                   //blocking
        false,                                                                  //dcFlush
        false,                                                                  //useSLM
        true,                                                                   //guardCommandBufferWithPipeControl
        false,                                                                  //GSBA32BitRequired
        false,                                                                  //requiresCoherency
        commandQueue.getPriority() == QueuePriority::LOW,                       //lowPriority
        false,                                                                  //implicitFlush
        commandStreamReceiver.isNTo1SubmissionModelEnabled(),                   //outOfOrderExecutionAllowed
        false,                                                                  //epilogueRequired
        false,                                                                  //usePerDssBackedBuffer
        false,                                                                  //useSingleSubdevice
        false,                                                                  //useGlobalAtomics
        commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex),  //areMultipleSubDevicesInContext
        false,                                                                  //memoryMigrationRequired
        false);                                                                 //textureCacheFlush

    if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
    }

    const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();

    if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
        eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
        if (isHandlingBarrier) {
            commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
        }
        makeTimestampPacketsResident(commandStreamReceiver);
    }

    gtpinNotifyPreFlushTask(&commandQueue);

    completionStamp = commandStreamReceiver.flushTask(*kernelOperation->commandStream,
                                                      0,
                                                      commandQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
                                                      commandQueue.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
                                                      commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
                                                      taskLevel,
                                                      dispatchFlags,
                                                      commandQueue.getDevice());

    if (isHandlingBarrier) {
        commandQueue.clearLastBcsPackets();
    }

    if (kernelOperation->blitEnqueue) {
        dispatchBlitOperation();
    }

    commandQueue.updateLatestSentEnqueueType(enqueueOperationType);

    return completionStamp;
}
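
// setEventsRequest deep-copies the caller's wait list (the cl_event array need
// not outlive the enqueue call) and takes an internal reference on each event;
// the references are released in ~Command.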
void Command::setEventsRequest(EventsRequest &eventsRequest) {
    this->eventsRequest = eventsRequest;
    if (eventsRequest.numEventsInWaitList > 0) {
        eventsWaitlist.resize(eventsRequest.numEventsInWaitList);
        auto size = eventsRequest.numEventsInWaitList * sizeof(cl_event);
        memcpy_s(&eventsWaitlist[0], size, eventsRequest.eventWaitList, size);
        this->eventsRequest.eventWaitList = &eventsWaitlist[0];
        for (cl_uint i = 0; i < eventsRequest.numEventsInWaitList; i++) {
            auto event = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
            event->incRefInternal();
        }
    }
}
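
// Timestamp-packet bookkeeping: the command keeps its own container of nodes
// (refcounts incremented) plus the dependencies captured at enqueue time, and
// makes them resident before each flush. Deferred nodes are returned to the
// queue's deferred container on destruction.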
void Command::setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketDependencies &&dependencies) {
    currentTimestampPacketNodes = std::make_unique<TimestampPacketContainer>();
    currentTimestampPacketNodes->assignAndIncrementNodesRefCounts(current);

    timestampPacketDependencies = std::make_unique<TimestampPacketDependencies>();
    *timestampPacketDependencies = std::move(dependencies);
}

Command::~Command() {
    if (commandQueue.getDeferredTimestampPackets() && timestampPacketDependencies.get()) {
        timestampPacketDependencies->moveNodesToNewContainer(*commandQueue.getDeferredTimestampPackets());
    }
    for (cl_event &eventFromWaitList : eventsWaitlist) {
        auto event = castToObjectOrAbort<Event>(eventFromWaitList);
        event->decRefInternal();
    }
}

void Command::makeTimestampPacketsResident(CommandStreamReceiver &commandStreamReceiver) {
    if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
        for (cl_event &eventFromWaitList : eventsWaitlist) {
            auto event = castToObjectOrAbort<Event>(eventFromWaitList);
            if (event->getTimestampPacketNodes() && event->getCommandQueue()->getClDevice().getRootDeviceIndex() == commandStreamReceiver.getRootDeviceIndex()) {
                event->getTimestampPacketNodes()->makeResident(commandStreamReceiver);
            }
        }
    }

    if (currentTimestampPacketNodes) {
        currentTimestampPacketNodes->makeResident(commandStreamReceiver);
    }

    if (timestampPacketDependencies) {
        timestampPacketDependencies->cacheFlushNodes.makeResident(commandStreamReceiver);
        timestampPacketDependencies->previousEnqueueNodes.makeResident(commandStreamReceiver);
    }
}

Command::Command(CommandQueue &commandQueue) : commandQueue(commandQueue) {}

Command::Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation)
    : commandQueue(commandQueue), kernelOperation(std::move(kernelOperation)) {}
} // namespace NEO