/* * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * */ #include "opencl/source/helpers/task_information.h" #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/command_stream/csr_deps.h" #include "shared/source/command_stream/linear_stream.h" #include "shared/source/command_stream/preemption.h" #include "shared/source/command_stream/wait_status.h" #include "shared/source/helpers/blit_properties.h" #include "shared/source/helpers/flush_stamp.h" #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/source/memory_manager/surface.h" #include "opencl/source/built_ins/builtins_dispatch_builder.h" #include "opencl/source/cl_device/cl_device.h" #include "opencl/source/command_queue/command_queue.h" #include "opencl/source/command_queue/enqueue_common.h" #include "opencl/source/gtpin/gtpin_notify.h" #include "opencl/source/helpers/cl_preemption_helper.h" #include "opencl/source/helpers/enqueue_properties.h" #include "opencl/source/helpers/task_information.inl" #include "opencl/source/mem_obj/mem_obj.h" namespace NEO { template void KernelOperation::ResourceCleaner::operator()(LinearStream *); template void KernelOperation::ResourceCleaner::operator()(IndirectHeap *); CommandMapUnmap::CommandMapUnmap(MapOperationType operationType, MemObj &memObj, MemObjSizeArray ©Size, MemObjOffsetArray ©Offset, bool readOnly, CommandQueue &commandQueue) : Command(commandQueue), memObj(memObj), copySize(copySize), copyOffset(copyOffset), readOnly(readOnly), operationType(operationType) { memObj.incRefInternal(); } CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminated) { DecRefInternalAtScopeEnd decRefInternalAtScopeEnd{memObj}; if (terminated) { this->terminated = true; return completionStamp; } auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver(); auto commandStreamReceiverOwnership = commandStreamReceiver.obtainUniqueOwnership(); auto &queueCommandStream = commandQueue.getCS(0); size_t offset = queueCommandStream.getUsed(); MultiDispatchInfo multiDispatch; Device &device = commandQueue.getDevice(); DispatchFlags dispatchFlags( {}, // csrDependencies nullptr, // barrierTimestampPacketNodes {}, // pipelineSelectArgs commandQueue.flushStamp->getStampReference(), // flushStampReference commandQueue.getThrottle(), // throttle ClPreemptionHelper::taskPreemptionMode(device, multiDispatch), // preemptionMode GrfConfig::NotApplicable, // numGrfRequired L3CachingSettings::NotApplicable, // l3CacheSettings ThreadArbitrationPolicy::NotPresent, // threadArbitrationPolicy AdditionalKernelExecInfo::NotApplicable, // additionalKernelExecInfo KernelExecutionType::NotApplicable, // kernelExecutionType MemoryCompressionState::NotApplicable, // memoryCompressionState commandQueue.getSliceCount(), // sliceCount true, // blocking true, // dcFlush false, // useSLM !commandQueue.getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), // guardCommandBufferWithPipeControl false, // GSBA32BitRequired false, // requiresCoherency commandQueue.getPriority() == QueuePriority::LOW, // lowPriority false, // implicitFlush commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed false, // epilogueRequired false, // usePerDssBackedBuffer false, // useGlobalAtomics false, // areMultipleSubDevicesInContext false, // memoryMigrationRequired false, // textureCacheFlush false, // hasStallingCmds false, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation commandQueue.isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady); gtpinNotifyPreFlushTask(&commandQueue); completionStamp = commandStreamReceiver.flushTask(queueCommandStream, offset, &commandQueue.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 0u), &commandQueue.getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 0u), &commandQueue.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 0u), taskLevel, dispatchFlags, commandQueue.getDevice()); commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::DependencyResolveOnGpu); if (!memObj.isMemObjZeroCopy()) { const auto waitStatus = commandQueue.waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false); if (waitStatus == WaitStatus::GpuHang) { completionStamp.taskCount = CompletionStamp::gpuHang; return completionStamp; } if (operationType == MAP) { memObj.transferDataToHostPtr(copySize, copyOffset); } else if (!readOnly) { DEBUG_BREAK_IF(operationType != UNMAP); memObj.transferDataFromHostPtr(copySize, copyOffset); } } return completionStamp; } CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, std::vector surfaces, bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr &&printfHandler, PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, TagNodeBase *multiRootDeviceSyncNode) : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM), commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel), kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) { UNRECOVERABLE_IF(nullptr == this->kernel); kernel->incRefInternal(); } CommandComputeKernel::~CommandComputeKernel() { kernel->decRefInternal(); } CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool terminated) { if (terminated) { this->terminated = true; for (auto surface : surfaces) { delete surface; } surfaces.clear(); return completionStamp; } auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver(); auto bcsCsrForAuxTranslation = commandQueue.getBcsForAuxTranslation(); auto commandStreamReceiverOwnership = commandStreamReceiver.obtainUniqueOwnership(); IndirectHeap *dsh = kernelOperation->dsh.get(); IndirectHeap *ioh = kernelOperation->ioh.get(); IndirectHeap *ssh = kernelOperation->ssh.get(); auto requiresCoherency = false; auto anyUncacheableArgs = false; for (auto &surface : surfaces) { DEBUG_BREAK_IF(!surface); surface->makeResident(commandStreamReceiver); requiresCoherency |= surface->IsCoherent; if (!surface->allowsL3Caching()) { anyUncacheableArgs = true; } } if (printfHandler) { printfHandler->makeResident(commandStreamReceiver); } makeTimestampPacketsResident(commandStreamReceiver); if (multiRootDeviceSyncNode != nullptr) { commandStreamReceiver.makeResident(*multiRootDeviceSyncNode->getBaseGraphicsAllocation()); } if (kernelOperation->blitPropertiesContainer.size() > 0) { CsrDependencies csrDeps; eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, *bcsCsrForAuxTranslation, CsrDependencies::DependenciesType::All); BlitProperties::setupDependenciesForAuxTranslation(kernelOperation->blitPropertiesContainer, *timestampPacketDependencies, *currentTimestampPacketNodes, csrDeps, commandQueue.getGpgpuCommandStreamReceiver(), *bcsCsrForAuxTranslation); commandQueue.setStallingCommandsOnNextFlush(true); } if (timestampPacketDependencies && commandQueue.isOOQEnabled()) { commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies); } const auto &kernelDescriptor = kernel->getKernelInfo().kernelDescriptor; auto memoryCompressionState = commandStreamReceiver.getMemoryCompressionState(kernel->isAuxTranslationRequired()); DispatchFlags dispatchFlags( {}, // csrDependencies nullptr, // barrierTimestampPacketNodes {false, kernel->isVmeKernel()}, // pipelineSelectArgs commandQueue.flushStamp->getStampReference(), // flushStampReference commandQueue.getThrottle(), // throttle preemptionMode, // preemptionMode kernelDescriptor.kernelAttributes.numGrfRequired, // numGrfRequired L3CachingSettings::l3CacheOn, // l3CacheSettings kernelDescriptor.kernelAttributes.threadArbitrationPolicy, // threadArbitrationPolicy kernel->getAdditionalKernelExecInfo(), // additionalKernelExecInfo kernel->getExecutionType(), // kernelExecutionType memoryCompressionState, // memoryCompressionState commandQueue.getSliceCount(), // sliceCount true, // blocking flushDC, // dcFlush slmUsed, // useSLM !commandQueue.getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), // guardCommandBufferWithPipeControl commandType == CL_COMMAND_NDRANGE_KERNEL, // GSBA32BitRequired requiresCoherency, // requiresCoherency commandQueue.getPriority() == QueuePriority::LOW, // lowPriority false, // implicitFlush commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed false, // epilogueRequired false, // usePerDssBackedBuffer kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, // useGlobalAtomics kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext kernel->requiresMemoryMigration(), // memoryMigrationRequired commandQueue.isTextureCacheFlushNeeded(this->commandType), // textureCacheFlush false, // hasStallingCmds false, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation commandQueue.isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); } const bool isHandlingBarrier = commandQueue.isStallingCommandsOnNextFlushRequired(); if (timestampPacketDependencies) { if (isHandlingBarrier) { commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies); } dispatchFlags.barrierTimestampPacketNodes = ×tampPacketDependencies->barrierNodes; } dispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode = kernel->requiresSystolicPipelineSelectMode(); if (anyUncacheableArgs) { dispatchFlags.l3CacheSettings = L3CachingSettings::l3CacheOff; } else if (!kernel->areStatelessWritesUsed()) { dispatchFlags.l3CacheSettings = L3CachingSettings::l3AndL1On; } if (commandQueue.dispatchHints != 0) { dispatchFlags.engineHints = commandQueue.dispatchHints; dispatchFlags.epilogueRequired = true; } DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady); gtpinNotifyPreFlushTask(&commandQueue); if (kernel->requiresMemoryMigration()) { for (auto &arg : kernel->getMemObjectsToMigrate()) { MigrationController::handleMigration(commandQueue.getContext(), commandStreamReceiver, arg.second); } } completionStamp = commandStreamReceiver.flushTask(*kernelOperation->commandStream, 0, dsh, ioh, ssh, taskLevel, dispatchFlags, commandQueue.getDevice()); if (isHandlingBarrier) { commandQueue.clearLastBcsPackets(); commandQueue.setStallingCommandsOnNextFlush(false); } if (kernelOperation->blitPropertiesContainer.size() > 0) { const auto newTaskCount = bcsCsrForAuxTranslation->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice()); if (newTaskCount <= CompletionStamp::notReady) { commandQueue.updateBcsTaskCount(bcsCsrForAuxTranslation->getOsContext().getEngineType(), newTaskCount); } else { completionStamp.taskCount = newTaskCount; } } commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::GpuKernel); if (gtpinIsGTPinInitialized()) { gtpinNotifyFlushTask(completionStamp.taskCount); } if (printfHandler) { const auto waitStatus = commandQueue.waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false); if (waitStatus == WaitStatus::GpuHang) { completionStamp.taskCount = CompletionStamp::gpuHang; } if (!printfHandler->printEnqueueOutput()) { completionStamp.taskCount = CompletionStamp::gpuHang; } } for (auto surface : surfaces) { delete surface; } surfaces.clear(); return completionStamp; } TaskCountType CommandWithoutKernel::dispatchBlitOperation() { auto bcsCsr = kernelOperation->bcsCsr; UNRECOVERABLE_IF(bcsCsr == nullptr); UNRECOVERABLE_IF(kernelOperation->blitPropertiesContainer.size() != 1); auto &blitProperties = *kernelOperation->blitPropertiesContainer.begin(); eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, *bcsCsr, CsrDependencies::DependenciesType::All); blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies->cacheFlushNodes); blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies->previousEnqueueNodes); blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies->barrierNodes); blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0]; if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForRootDevices(blitProperties.csrDependencies, *bcsCsr); } const auto newTaskCount = bcsCsr->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice()); if (newTaskCount > CompletionStamp::notReady) { return newTaskCount; } commandQueue.updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount); commandQueue.setLastBcsPacket(bcsCsr->getOsContext().getEngineType()); return newTaskCount; } CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool terminated) { if (terminated) { this->terminated = true; return completionStamp; } auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver(); if (!kernelOperation) { completionStamp.taskCount = commandStreamReceiver.peekTaskCount(); completionStamp.taskLevel = commandStreamReceiver.peekTaskLevel(); completionStamp.flushStamp = commandStreamReceiver.obtainCurrentFlushStamp(); return completionStamp; } auto barrierNodes = timestampPacketDependencies ? ×tampPacketDependencies->barrierNodes : nullptr; auto lockCSR = commandStreamReceiver.obtainUniqueOwnership(); auto enqueueOperationType = EnqueueProperties::Operation::DependencyResolveOnGpu; if (kernelOperation->blitEnqueue) { enqueueOperationType = EnqueueProperties::Operation::Blit; UNRECOVERABLE_IF(!barrierNodes); if (commandQueue.isStallingCommandsOnNextFlushRequired()) { barrierNodes->add(commandStreamReceiver.getTimestampPacketAllocator()->getTag()); } } if (timestampPacketDependencies && commandQueue.isOOQEnabled()) { commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies); } auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex(); DispatchFlags dispatchFlags( {}, // csrDependencies barrierNodes, // barrierTimestampPacketNodes {}, // pipelineSelectArgs commandQueue.flushStamp->getStampReference(), // flushStampReference commandQueue.getThrottle(), // throttle commandQueue.getDevice().getPreemptionMode(), // preemptionMode GrfConfig::NotApplicable, // numGrfRequired L3CachingSettings::NotApplicable, // l3CacheSettings ThreadArbitrationPolicy::NotPresent, // threadArbitrationPolicy AdditionalKernelExecInfo::NotApplicable, // additionalKernelExecInfo KernelExecutionType::NotApplicable, // kernelExecutionType MemoryCompressionState::NotApplicable, // memoryCompressionState commandQueue.getSliceCount(), // sliceCount true, // blocking false, // dcFlush false, // useSLM !commandStreamReceiver.isUpdateTagFromWaitEnabled(), // guardCommandBufferWithPipeControl false, // GSBA32BitRequired false, // requiresCoherency commandQueue.getPriority() == QueuePriority::LOW, // lowPriority false, // implicitFlush commandStreamReceiver.isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed false, // epilogueRequired false, // usePerDssBackedBuffer false, // useGlobalAtomics commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), // areMultipleSubDevicesInContext false, // memoryMigrationRequired false, // textureCacheFlush false, // hasStallingCmds false, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation commandQueue.isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); } const bool isHandlingBarrier = commandQueue.isStallingCommandsOnNextFlushRequired(); if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) { if (isHandlingBarrier) { commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies); } makeTimestampPacketsResident(commandStreamReceiver); } gtpinNotifyPreFlushTask(&commandQueue); completionStamp = commandStreamReceiver.flushTask(*kernelOperation->commandStream, 0, &commandQueue.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 0u), &commandQueue.getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 0u), &commandQueue.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 0u), taskLevel, dispatchFlags, commandQueue.getDevice()); if (isHandlingBarrier) { commandQueue.clearLastBcsPackets(); commandQueue.setStallingCommandsOnNextFlush(false); } if (kernelOperation->blitEnqueue) { auto taskCount = dispatchBlitOperation(); if (taskCount > CompletionStamp::notReady) { completionStamp.taskCount = taskCount; } } commandQueue.updateLatestSentEnqueueType(enqueueOperationType); return completionStamp; } void Command::setEventsRequest(EventsRequest &eventsRequest) { this->eventsRequest = eventsRequest; if (eventsRequest.numEventsInWaitList > 0) { eventsWaitlist.resize(eventsRequest.numEventsInWaitList); auto size = eventsRequest.numEventsInWaitList * sizeof(cl_event); memcpy_s(&eventsWaitlist[0], size, eventsRequest.eventWaitList, size); this->eventsRequest.eventWaitList = &eventsWaitlist[0]; for (cl_uint i = 0; i < eventsRequest.numEventsInWaitList; i++) { auto event = castToObjectOrAbort(eventsRequest.eventWaitList[i]); event->incRefInternal(); } } } void Command::setTimestampPacketNode(TimestampPacketContainer ¤t, TimestampPacketDependencies &&dependencies) { currentTimestampPacketNodes = std::make_unique(); currentTimestampPacketNodes->assignAndIncrementNodesRefCounts(current); timestampPacketDependencies = std::make_unique(); *timestampPacketDependencies = std::move(dependencies); } Command::~Command() { if (terminated) { if (commandQueue.getTimestampPacketContainer()) { std::array timestampData; timestampData.fill(std::numeric_limits::max()); if (currentTimestampPacketNodes.get()) { for (auto &node : currentTimestampPacketNodes->peekNodes()) { for (const auto &cmdQueueNode : commandQueue.getTimestampPacketContainer()->peekNodes()) { if (node == cmdQueueNode) { for (uint32_t i = 0; i < node->getPacketsUsed(); i++) { node->assignDataToAllTimestamps(i, timestampData.data()); } } } } } } } else { if (commandQueue.getDeferredTimestampPackets() && timestampPacketDependencies.get()) { timestampPacketDependencies->moveNodesToNewContainer(*commandQueue.getDeferredTimestampPackets()); } } for (cl_event &eventFromWaitList : eventsWaitlist) { auto event = castToObjectOrAbort(eventFromWaitList); event->decRefInternal(); } } void Command::makeTimestampPacketsResident(CommandStreamReceiver &commandStreamReceiver) { if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) { for (cl_event &eventFromWaitList : eventsWaitlist) { auto event = castToObjectOrAbort(eventFromWaitList); if (event->getTimestampPacketNodes() && event->getCommandQueue()->getClDevice().getRootDeviceIndex() == commandStreamReceiver.getRootDeviceIndex()) { event->getTimestampPacketNodes()->makeResident(commandStreamReceiver); } } } if (currentTimestampPacketNodes) { currentTimestampPacketNodes->makeResident(commandStreamReceiver); } if (timestampPacketDependencies) { timestampPacketDependencies->cacheFlushNodes.makeResident(commandStreamReceiver); timestampPacketDependencies->previousEnqueueNodes.makeResident(commandStreamReceiver); } } Command::Command(CommandQueue &commandQueue) : commandQueue(commandQueue) {} Command::Command(CommandQueue &commandQueue, std::unique_ptr &kernelOperation) : commandQueue(commandQueue), kernelOperation(std::move(kernelOperation)) {} } // namespace NEO