/*
 * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/surface.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/program/sync_buffer_handler.h"
#include "shared/source/program/sync_buffer_handler.inl"
#include "shared/source/utilities/range.h"
#include "shared/source/utilities/tag_allocator.h"

#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/command_queue/command_queue_hw.h"
#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/command_queue/hardware_interface.h"
#include "opencl/source/event/event_builder.h"
#include "opencl/source/event/user_event.h"
#include "opencl/source/gtpin/gtpin_notify.h"
#include "opencl/source/helpers/cl_blit_properties.h"
#include "opencl/source/helpers/cl_hw_helper.h"
#include "opencl/source/helpers/cl_preemption_helper.h"
#include "opencl/source/helpers/dispatch_info_builder.h"
#include "opencl/source/helpers/enqueue_properties.h"
#include "opencl/source/helpers/task_information.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/source/memory_manager/migration_controller.h"
#include "opencl/source/program/printf_handler.h"
#include "opencl/source/utilities/cl_logger.h"

#include <algorithm>
#include <new>

namespace NEO {

template <typename GfxFamily>
template <uint32_t commandType, size_t surfaceCount>
cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount],
                                                 bool blocking,
                                                 Kernel *kernel,
                                                 cl_uint workDim,
                                                 const size_t globalOffsets[3],
                                                 const size_t workItems[3],
                                                 const size_t *localWorkSizesIn,
                                                 const size_t *enqueuedWorkSizes,
                                                 cl_uint numEventsInWaitList,
                                                 const cl_event *eventWaitList,
                                                 cl_event *event) {
    BuiltInOwnershipWrapper builtInLock;
    std::unique_ptr<KernelObjsForAuxTranslation> kernelObjsForAuxTranslation;
    MultiDispatchInfo multiDispatchInfo(kernel);

    auto auxTranslationMode = AuxTranslationMode::None;

    kernel->updateAuxTranslationRequired();
    if (kernel->isAuxTranslationRequired()) {
        kernelObjsForAuxTranslation = kernel->fillWithKernelObjsForAuxTranslation();

        if (!kernelObjsForAuxTranslation->empty()) {
            auxTranslationMode = HwHelperHw<GfxFamily>::get().getAuxTranslationMode(device->getHardwareInfo());
        }
        multiDispatchInfo.setKernelObjsForAuxTranslation(std::move(kernelObjsForAuxTranslation));
    }

    if (AuxTranslationMode::Builtin == auxTranslationMode) {
        auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getClDevice());
        builtInLock.takeOwnership(builder, this->context);

        dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::AuxToNonAux);
    }

    if (kernel->getKernelInfo().builtinDispatchBuilder == nullptr) {
        DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::WalkerSplit> builder(getClDevice());
        builder.setDispatchGeometry(workDim, workItems, enqueuedWorkSizes, globalOffsets, Vec3<size_t>{0, 0, 0}, localWorkSizesIn);
        builder.setKernel(kernel);
        builder.bake(multiDispatchInfo);
    } else {
        auto builder = kernel->getKernelInfo().builtinDispatchBuilder;
        builder->buildDispatchInfos(multiDispatchInfo, kernel, workDim, workItems, enqueuedWorkSizes, globalOffsets);

        if (multiDispatchInfo.size() == 0) {
            return CL_SUCCESS;
        }
    }

    if (AuxTranslationMode::Builtin == auxTranslationMode) {
        dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::NonAuxToAux);
    }

    if (AuxTranslationMode::Blit == auxTranslationMode) {
        setupBlitAuxTranslation(multiDispatchInfo);
    }

    return enqueueHandler<commandType>(surfaces, blocking, multiDispatchInfo, numEventsInWaitList, eventWaitList, event);
}
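// Main enqueue path: the enqueue entry points with a prepared MultiDispatchInfo
// converge on this overload, which resolves the task level, collects timestamp-packet
// dependencies, programs the dispatch, and either submits the work or defers it on a
// blocked queue.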
template <typename GfxFamily>
template <uint32_t commandType>
cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                                                 size_t numSurfaceForResidency,
                                                 bool blocking,
                                                 const MultiDispatchInfo &multiDispatchInfo,
                                                 cl_uint numEventsInWaitList,
                                                 const cl_event *eventWaitList,
                                                 cl_event *event) {
    if (multiDispatchInfo.empty() && !isCommandWithoutKernel(commandType)) {
        const auto enqueueResult = enqueueHandler<CL_COMMAND_MARKER>(nullptr, 0, blocking, multiDispatchInfo,
                                                                     numEventsInWaitList, eventWaitList, event);
        if (enqueueResult != CL_SUCCESS) {
            return enqueueResult;
        }
        if (event) {
            castToObjectOrAbort<Event>(*event)->setCmdType(commandType);
        }
        return CL_SUCCESS;
    }

    TagNodeBase *hwTimeStamps = nullptr;
    CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver();

    EventBuilder eventBuilder;
    setupEvent(eventBuilder, event, commandType);

    bool isMarkerWithProfiling = (CL_COMMAND_MARKER == commandType) && (eventBuilder.getEvent() && eventBuilder.getEvent()->isProfilingEnabled());

    std::unique_ptr<KernelOperation> blockedCommandsData;
    std::unique_ptr<PrintfHandler> printfHandler;
    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);

    auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership();

    auto blockQueue = false;
    auto taskLevel = 0u;
    obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitList, eventWaitList, blockQueue, commandType);

    enqueueHandlerHook(commandType, multiDispatchInfo);

    bool clearDependenciesForSubCapture = false;
    aubCaptureHook(blocking, clearDependenciesForSubCapture, multiDispatchInfo);

    bool clearAllDependencies = (queueDependenciesClearRequired() || clearDependenciesForSubCapture);

    if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
        blocking = true;
    }

    TimestampPacketDependencies timestampPacketDependencies;
    EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
    CsrDependencies csrDeps;
    BlitPropertiesContainer blitPropertiesContainer;

    if (this->context->getRootDeviceIndices().size() > 1) {
        eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver);
    }

    bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo);

    if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
        if (!clearDependenciesForSubCapture) {
            eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, computeCommandStreamReceiver, CsrDependencies::DependenciesType::OnCsr);
        }
        auto allocator = computeCommandStreamReceiver.getTimestampPacketAllocator();

        size_t nodesCount = 0u;
        if (isCacheFlushCommand(commandType) || isMarkerWithProfiling) {
            nodesCount = 1;
        } else if (!multiDispatchInfo.empty()) {
            nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
        }

        if (isCacheFlushForBcsRequired() && enqueueWithBlitAuxTranslation) {
            // Cache flush for aux translation is always required (if supported)
            timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
        }

        if (nodesCount > 0) {
            obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, computeCommandStreamReceiver);
            csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
        }
    }
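    // Dependencies are gathered; reserve space in the command stream and program the
    // dispatch for this command (kernels, cache flush, marker, or barrier).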
    auto &commandStream = *obtainCommandStream<commandType>(csrDeps, false, blockQueue, multiDispatchInfo, eventsRequest,
                                                            blockedCommandsData, surfacesForResidency, numSurfaceForResidency, isMarkerWithProfiling);
    auto commandStreamStart = commandStream.getUsed();

    if (this->context->getRootDeviceIndices().size() > 1) {
        TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, csrDeps);
    }

    if (enqueueWithBlitAuxTranslation) {
        processDispatchForBlitAuxTranslation(*getBcsForAuxTranslation(), multiDispatchInfo, blitPropertiesContainer,
                                             timestampPacketDependencies, eventsRequest, blockQueue);
    }

    if (eventBuilder.getEvent() && computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
        eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
        eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
        eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.auxToNonAuxNodes);
    }

    bool flushDependenciesForNonKernelCommand = false;

    if (multiDispatchInfo.empty() == false) {
        processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(), hwTimeStamps,
                                               blockQueue, csrDeps, blockedCommandsData.get(), timestampPacketDependencies);
    } else if (isCacheFlushCommand(commandType)) {
        processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
    } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
        if (CL_COMMAND_BARRIER == commandType) {
            computeCommandStreamReceiver.requestStallingCommandsOnNextFlush();
        }

        for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
            auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
            if (waitlistEvent->getTimestampPacketNodes()) {
                flushDependenciesForNonKernelCommand = true;
                if (eventBuilder.getEvent()) {
                    eventBuilder.getEvent()->addTimestampPacketNodes(*waitlistEvent->getTimestampPacketNodes());
                }
            }
        }

        if (isMarkerWithProfiling) {
            flushDependenciesForNonKernelCommand = true;
        }

        if (flushDependenciesForNonKernelCommand) {
            TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, csrDeps);
        }

        if (isMarkerWithProfiling) {
            if (numEventsInWaitList == 0) {
                computeCommandStreamReceiver.programComputeBarrierCommand(commandStream);
            }
            processDispatchForMarkerWithTimestampPacket(*this, &commandStream, eventsRequest, csrDeps);
        }
    } else if (isMarkerWithProfiling) {
        processDispatchForMarker(*this, &commandStream, eventsRequest, csrDeps);
    }

    CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
    const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType),
                                              flushDependenciesForNonKernelCommand, isMarkerWithProfiling, &blitPropertiesContainer);

    if (!blockQueue && isOOQEnabled()) {
        setupBarrierTimestampForBcsEngines(computeCommandStreamReceiver.getOsContext().getEngineType(), timestampPacketDependencies);
    }

    bool migratedMemory = false;

    if (!blockQueue && multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->requiresMemoryMigration()) {
        for (auto &arg : multiDispatchInfo.peekMainKernel()->getMemObjectsToMigrate()) {
            MigrationController::handleMigration(*this->context, computeCommandStreamReceiver, arg.second);
            migratedMemory = true;
        }
    }
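    // Submission: an unblocked queue flushes through the compute CSR right away, while
    // a blocked queue packages the prepared work into a Command tied to a virtual event.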
    if (!blockQueue) {
        if (enqueueProperties.operation == EnqueueProperties::Operation::GpuKernel) {
            csrDeps.makeResident(computeCommandStreamReceiver);

            completionStamp = enqueueNonBlocked<commandType>(
                surfacesForResidency,
                numSurfaceForResidency,
                commandStream,
                commandStreamStart,
                blocking,
                clearDependenciesForSubCapture,
                multiDispatchInfo,
                enqueueProperties,
                timestampPacketDependencies,
                eventsRequest,
                eventBuilder,
                taskLevel,
                printfHandler.get());
        } else if (enqueueProperties.isFlushWithoutKernelRequired()) {
            completionStamp = enqueueCommandWithoutKernel(
                surfacesForResidency,
                numSurfaceForResidency,
                &commandStream,
                commandStreamStart,
                blocking,
                enqueueProperties,
                timestampPacketDependencies,
                eventsRequest,
                eventBuilder,
                taskLevel,
                csrDeps,
                nullptr);
        } else {
            UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::EnqueueWithoutSubmission);

            auto maxTaskCountCurrentRootDevice = this->taskCount;

            for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
                auto event = castToObject<Event>(eventWaitList[eventId]);

                if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() == this->getDevice().getRootDeviceIndex()) {
                    maxTaskCountCurrentRootDevice = std::max(maxTaskCountCurrentRootDevice, event->peekTaskCount());
                }
            }

            // inherit data from event_wait_list and previous packets
            completionStamp.flushStamp = this->flushStamp->peekStamp();
            completionStamp.taskCount = maxTaskCountCurrentRootDevice;
            completionStamp.taskLevel = taskLevel;

            if (eventBuilder.getEvent() && isProfilingEnabled()) {
                eventBuilder.getEvent()->setSubmitTimeStamp();
                eventBuilder.getEvent()->setStartTimeStamp();
            }

            // check if we have BCS associated, if so we need to make sure it is completed as well
            if (eventBuilder.getEvent() && this->bcsEngineTypes.size() > 0u) {
                eventBuilder.getEvent()->setupBcs(this->getBcsCommandStreamReceiver(this->bcsEngineTypes[0u])->getOsContext().getEngineType());
            }
        }
        if (eventBuilder.getEvent()) {
            eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
        }

        this->latestSentEnqueueType = enqueueProperties.operation;
    }

    if (completionStamp.taskCount == CompletionStamp::gpuHang) {
        return CL_OUT_OF_RESOURCES;
    }

    updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());

    if (blockQueue) {
        enqueueBlocked(commandType, surfacesForResidency, numSurfaceForResidency, multiDispatchInfo, timestampPacketDependencies,
                       blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, std::move(printfHandler), nullptr);
    }

    if (deferredTimestampPackets.get()) {
        timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
        csrDeps.copyNodesToNewContainer(*deferredTimestampPackets);
    }

    commandStreamReceiverOwnership.unlock();
    queueOwnership.unlock();
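    // Blocking enqueues wait for all engines before returning. A transfer that needs a
    // post-operation CPU copy waits twice, so the staging allocation remains valid while
    // memcpy_s copies it into the user pointer.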
    if (blocking) {
        auto waitStatus = WaitStatus::Ready;
        auto &builtinOpParams = multiDispatchInfo.peekBuiltinOpParams();
        if (builtinOpParams.userPtrForPostOperationCpuCopy) {
            waitStatus = waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), false);
            if (waitStatus == WaitStatus::GpuHang) {
                return CL_OUT_OF_RESOURCES;
            }

            auto hostPtrAlloc = builtinOpParams.transferAllocation;
            UNRECOVERABLE_IF(nullptr == hostPtrAlloc);
            auto size = hostPtrAlloc->getUnderlyingBufferSize();
            [[maybe_unused]] int cpuCopyStatus = memcpy_s(builtinOpParams.userPtrForPostOperationCpuCopy, size, hostPtrAlloc->getUnderlyingBuffer(), size);
            DEBUG_BREAK_IF(cpuCopyStatus != 0);

            waitStatus = waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), true);
        } else {
            waitStatus = waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), true);
        }

        if (waitStatus == WaitStatus::GpuHang) {
            return CL_OUT_OF_RESOURCES;
        }
    }

    if (migratedMemory) {
        computeCommandStreamReceiver.flushBatchedSubmissions();
    }

    return CL_SUCCESS;
}

template <typename GfxFamily>
template <uint32_t commandType>
void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo,
                                                          std::unique_ptr<PrintfHandler> &printfHandler,
                                                          Event *event,
                                                          TagNodeBase *&hwTimeStamps,
                                                          bool blockQueue,
                                                          CsrDependencies &csrDeps,
                                                          KernelOperation *blockedCommandsData,
                                                          TimestampPacketDependencies &timestampPacketDependencies) {
    TagNodeBase *hwPerfCounter = nullptr;
    getClFileLogger().dumpKernelArgs(&multiDispatchInfo);

    printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
    if (printfHandler) {
        printfHandler->prepareDispatch(multiDispatchInfo);
    }

    if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
        auto &gws = multiDispatchInfo.begin()->getGWS();
        auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
        size_t workGroupsCount = (gws.x * gws.y * gws.z) /
                                 (lws.x * lws.y * lws.z);
        device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
    }

    if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
        if (multiDispatchInfo.peekMainKernel()->isKernelDebugEnabled()) {
            setupDebugSurface(multiDispatchInfo.peekMainKernel());
        }
    }

    if (event && this->isProfilingEnabled()) {
        // Get allocation for timestamps
        hwTimeStamps = event->getHwTimeStampNode();
    }

    if (event && this->isPerfCountersEnabled()) {
        hwPerfCounter = event->getHwPerfCounterNode();
    }

    HardwareInterfaceWalkerArgs dispatchWalkerArgs = {};
    dispatchWalkerArgs.blockedCommandsData = blockedCommandsData;
    dispatchWalkerArgs.hwTimeStamps = hwTimeStamps;
    dispatchWalkerArgs.hwPerfCounter = hwPerfCounter;
    dispatchWalkerArgs.timestampPacketDependencies = &timestampPacketDependencies;
    dispatchWalkerArgs.currentTimestampPacketNodes = timestampPacketContainer.get();
    dispatchWalkerArgs.commandType = commandType;
    dispatchWalkerArgs.event = event;

    HardwareInterface<GfxFamily>::dispatchWalker(
        *this,
        multiDispatchInfo,
        csrDeps,
        dispatchWalkerArgs);

    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
        for (auto &dispatchInfo : multiDispatchInfo) {
            for (auto &patchInfoData : dispatchInfo.getKernel()->getPatchInfoDataList()) {
                getGpgpuCommandStreamReceiver().getFlatBatchBufferHelper().setPatchInfoData(patchInfoData);
            }
        }
    }

    getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(), multiDispatchInfo.getRequiredPrivateScratchSize());
}
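// Builds the BlitProperties for a copy-engine (BCS) enqueue: resolves the blit
// direction from the command type, attaches CSR dependencies, and, when a GPGPU
// command stream is present, emits a cache-flushing barrier with a timestamp
// post-sync write.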
template <typename GfxFamily>
BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandStreamReceiver &blitCommandStreamReceiver,
                                                                        const MultiDispatchInfo &multiDispatchInfo,
                                                                        TimestampPacketDependencies &timestampPacketDependencies,
                                                                        const EventsRequest &eventsRequest, LinearStream *commandStream,
                                                                        uint32_t commandType, bool queueBlocked) {
    auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType);

    auto blitProperties = ClBlitProperties::constructProperties(blitDirection, blitCommandStreamReceiver,
                                                                multiDispatchInfo.peekBuiltinOpParams());
    if (!queueBlocked) {
        eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, blitCommandStreamReceiver,
                                                                     CsrDependencies::DependenciesType::All);

        blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.cacheFlushNodes);
        blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
        blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes);
    }

    auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
    blitProperties.outputTimestampPacket = currentTimestampPacketNode;

    if (commandStream) {
        if (timestampPacketDependencies.cacheFlushNodes.peekNodes().size() > 0) {
            auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
            const auto &hwInfo = device->getHardwareInfo();
            PipeControlArgs args;
            args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, hwInfo);
            MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
                *commandStream, PostSyncMode::ImmediateData,
                cacheFlushTimestampPacketGpuAddress, 0, hwInfo, args);
        }
    }
    return blitProperties;
}

template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::processDispatchForBlitAuxTranslation(CommandStreamReceiver &bcsCsr,
                                                                     const MultiDispatchInfo &multiDispatchInfo,
                                                                     BlitPropertiesContainer &blitPropertiesContainer,
                                                                     TimestampPacketDependencies &timestampPacketDependencies,
                                                                     const EventsRequest &eventsRequest, bool queueBlocked) {
    const auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    const auto nodesAllocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
    const auto numKernelObjs = multiDispatchInfo.getKernelObjsForAuxTranslation()->size();
    blitPropertiesContainer.resize(numKernelObjs * 2);

    auto bufferIndex = 0;
    for (const auto &kernelObj : *multiDispatchInfo.getKernelObjsForAuxTranslation()) {
        GraphicsAllocation *allocation = nullptr;
        if (kernelObj.type == KernelObjForAuxTranslation::Type::MEM_OBJ) {
            const auto buffer = static_cast<Buffer *>(kernelObj.object);
            allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
        } else {
            DEBUG_BREAK_IF(kernelObj.type != KernelObjForAuxTranslation::Type::GFX_ALLOC);
            allocation = static_cast<GraphicsAllocation *>(kernelObj.object);
        }
        {
            // Aux to NonAux
            blitPropertiesContainer[bufferIndex] = BlitProperties::constructPropertiesForAuxTranslation(
                AuxTranslationDirection::AuxToNonAux, allocation, getGpgpuCommandStreamReceiver().getClearColorAllocation());
            const auto auxToNonAuxNode = nodesAllocator->getTag();
            timestampPacketDependencies.auxToNonAuxNodes.add(auxToNonAuxNode);
        }

        {
            // NonAux to Aux
            blitPropertiesContainer[bufferIndex + numKernelObjs] = BlitProperties::constructPropertiesForAuxTranslation(
                AuxTranslationDirection::NonAuxToAux, allocation, getGpgpuCommandStreamReceiver().getClearColorAllocation());
            const auto nonAuxToAuxNode = nodesAllocator->getTag();
            timestampPacketDependencies.nonAuxToAuxNodes.add(nonAuxToAuxNode);
        }
        bufferIndex++;
    }

    if (!queueBlocked) {
        CsrDependencies csrDeps;
        eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);

        BlitProperties::setupDependenciesForAuxTranslation(blitPropertiesContainer, timestampPacketDependencies,
                                                           *this->timestampPacketContainer, csrDeps,
                                                           getGpgpuCommandStreamReceiver(), bcsCsr);
    }

    eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);
}

template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::processDispatchForCacheFlush(Surface **surfaces,
                                                             size_t numSurfaces,
                                                             LinearStream *commandStream,
                                                             CsrDependencies &csrDeps) {
    TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDeps);

    uint64_t postSyncAddress = 0;
    if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
        auto timestampPacketNodeForPostSync = timestampPacketContainer->peekNodes().at(0);
        timestampPacketNodeForPostSync->setProfilingCapable(false);
        postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync);
    }
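    // Emit the cache-flush command; a postSyncAddress of 0 requests no post-sync write.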
    submitCacheFlush(surfaces, numSurfaces, commandStream, postSyncAddress);
}

template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::processDispatchForMarker(CommandQueue &commandQueue,
                                                         LinearStream *commandStream,
                                                         EventsRequest &eventsRequest,
                                                         CsrDependencies &csrDeps) {
    auto event = castToObjectOrAbort<Event>(*eventsRequest.outEvent);

    TagNodeBase *hwTimeStamps = nullptr;
    TagNodeBase *hwPerfCounter = nullptr;

    hwTimeStamps = event->getHwTimeStampNode();

    HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
    HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
    getGpgpuCommandStreamReceiver().makeResident(*hwTimeStamps->getBaseGraphicsAllocation());
}

template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue,
                                                                            LinearStream *commandStream,
                                                                            EventsRequest &eventsRequest,
                                                                            CsrDependencies &csrDeps) {
    auto currentTimestampPacketNode = commandQueue.getTimestampPacketContainer()->peekNodes().at(0);

    auto timestampContextStartGpuAddress = TimestampPacketHelper::getContextStartGpuAddress(*currentTimestampPacketNode);
    auto timestampGlobalStartAddress = TimestampPacketHelper::getGlobalStartGpuAddress(*currentTimestampPacketNode);

    EncodeStoreMMIO<GfxFamily>::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextStartGpuAddress, false);
    EncodeStoreMMIO<GfxFamily>::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalStartAddress, false);

    auto timestampContextEndGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*currentTimestampPacketNode);
    auto timestampGlobalEndAddress = TimestampPacketHelper::getGlobalEndGpuAddress(*currentTimestampPacketNode);

    EncodeStoreMMIO<GfxFamily>::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextEndGpuAddress, false);
    EncodeStoreMMIO<GfxFamily>::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalEndAddress, false);
}

template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType) {
    auto isQueueBlockedStatus = isQueueBlocked();
    taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
    blockQueueStatus = (taskLevel == CompletionStamp::notReady) || isQueueBlockedStatus;

    auto taskLevelUpdateRequired = isTaskLevelUpdateRequired(taskLevel, eventWaitList, numEventsInWaitList, commandType);

    if (taskLevelUpdateRequired) {
        taskLevel++;
        this->taskLevel = taskLevel;
    }

    DBG_LOG(EventsDebugEnable, "blockQueue", blockQueueStatus, "virtualEvent", virtualEvent, "taskLevel", taskLevel);
}
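// Decides whether this enqueue bumps the queue's task level. The rules below encode
// in-order vs out-of-order queue semantics, with barriers as the exception among
// commands that carry no kernel.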
template <typename GfxFamily>
bool CommandQueueHw<GfxFamily>::isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType) {
    bool updateTaskLevel = true;
    // if we are blocked by a user event then no update
    if (taskLevel == CompletionStamp::notReady) {
        updateTaskLevel = false;
    }
    // a command without a kernel inherits state from previous commands;
    // barrier is the exception
    if (isCommandWithoutKernel(commandType) && commandType != CL_COMMAND_BARRIER) {
        updateTaskLevel = false;
    }
    // OOQ special cases start here
    if (this->isOOQEnabled()) {
        // with no wait list and no barrier, do not update the task level
        if (eventWaitList == nullptr && commandType != CL_COMMAND_BARRIER) {
            updateTaskLevel = false;
        }
        // with a wait list, deduce the task level from it and check whether it is higher than the queue's current task level
        if (eventWaitList != nullptr) {
            auto taskLevelFromEvents = getTaskLevelFromWaitList(0, numEventsInWaitList, eventWaitList);
            taskLevelFromEvents++;
            if (taskLevelFromEvents <= this->taskLevel) {
                updateTaskLevel = false;
            }
        }
    }
    return updateTaskLevel;
}

template <typename GfxFamily>
template <uint32_t commandType>
CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
    Surface **surfaces,
    size_t surfaceCount,
    LinearStream &commandStream,
    size_t commandStreamStart,
    bool &blocking,
    bool clearDependenciesForSubCapture,
    const MultiDispatchInfo &multiDispatchInfo,
    const EnqueueProperties &enqueueProperties,
    TimestampPacketDependencies &timestampPacketDependencies,
    EventsRequest &eventsRequest,
    EventBuilder &eventBuilder,
    uint32_t taskLevel,
    PrintfHandler *printfHandler) {
    UNRECOVERABLE_IF(multiDispatchInfo.empty());

    auto implicitFlush = false;

    if (printfHandler) {
        blocking = true;
        printfHandler->makeResident(getGpgpuCommandStreamReceiver());
    }

    if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
        device->getDevice().syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver());
    }

    if (timestampPacketContainer) {
        timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
        timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
        timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
    }

    bool anyUncacheableArgs = false;
    auto requiresCoherency = false;
    for (auto surface : createRange(surfaces, surfaceCount)) {
        surface->makeResident(getGpgpuCommandStreamReceiver());
        requiresCoherency |= surface->IsCoherent;
        if (!surface->allowsL3Caching()) {
            anyUncacheableArgs = true;
        }
    }

    auto mediaSamplerRequired = false;
    uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber;
    auto systolicPipelineSelectMode = false;
    Kernel *kernel = nullptr;
    bool auxTranslationRequired = false;
    bool useGlobalAtomics = false;

    for (auto &dispatchInfo : multiDispatchInfo) {
        if (kernel != dispatchInfo.getKernel()) {
            kernel = dispatchInfo.getKernel();
        } else {
            continue;
        }
        kernel->makeResident(getGpgpuCommandStreamReceiver());
        requiresCoherency |= kernel->requiresCoherency();
        mediaSamplerRequired |= kernel->isVmeKernel();
        auto numGrfRequiredByKernel = static_cast<uint32_t>(kernel->getKernelInfo().kernelDescriptor.kernelAttributes.numGrfRequired);
        numGrfRequired = std::max(numGrfRequired, numGrfRequiredByKernel);
        systolicPipelineSelectMode |= kernel->requiresSystolicPipelineSelectMode();
        auxTranslationRequired |= kernel->isAuxTranslationRequired();
        if (kernel->hasUncacheableStatelessArgs()) {
            anyUncacheableArgs = true;
        }
        if (kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics) {
            useGlobalAtomics = true;
        }
    }

    if (mediaSamplerRequired) {
        DEBUG_BREAK_IF(device->getDeviceInfo().preemptionSupported != false);
    }

    if (isProfilingEnabled() && eventBuilder.getEvent()) {
        eventBuilder.getEvent()->setSubmitTimeStamp();

        auto hwTimestampNode = eventBuilder.getEvent()->getHwTimeStampNode();
        if (hwTimestampNode) {
            getGpgpuCommandStreamReceiver().makeResident(*hwTimestampNode->getBaseGraphicsAllocation());
        }

        if (isPerfCountersEnabled()) {
            getGpgpuCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwPerfCounterNode()->getBaseGraphicsAllocation());
        }
    }

    IndirectHeap *dsh = nullptr;
    IndirectHeap *ioh = nullptr;

    dsh = &getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 0u);
    ioh = &getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 0u);
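    // Build the DispatchFlags for flushTask: check residency for allocations that force
    // a DC flush, then aggregate the per-kernel requirements gathered above.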
    auto allocNeedsFlushDC = false;
    if (!device->isFullRangeSvm()) {
        if (std::any_of(getGpgpuCommandStreamReceiver().getResidencyAllocations().begin(),
                        getGpgpuCommandStreamReceiver().getResidencyAllocations().end(),
                        [](const auto allocation) { return allocation->isFlushL3Required(); })) {
            allocNeedsFlushDC = true;
        }
    }

    auto memoryCompressionState = getGpgpuCommandStreamReceiver().getMemoryCompressionState(auxTranslationRequired, device->getHardwareInfo());

    DispatchFlags dispatchFlags(
        {},                                                                                         // csrDependencies
        &timestampPacketDependencies.barrierNodes,                                                  // barrierTimestampPacketNodes
        {},                                                                                         // pipelineSelectArgs
        this->flushStamp->getStampReference(),                                                      // flushStampReference
        getThrottle(),                                                                              // throttle
        ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo),                     // preemptionMode
        numGrfRequired,                                                                             // numGrfRequired
        L3CachingSettings::l3CacheOn,                                                               // l3CacheSettings
        kernel->getDescriptor().kernelAttributes.threadArbitrationPolicy,                           // threadArbitrationPolicy
        kernel->getAdditionalKernelExecInfo(),                                                      // additionalKernelExecInfo
        kernel->getExecutionType(),                                                                 // kernelExecutionType
        memoryCompressionState,                                                                     // memoryCompressionState
        getSliceCount(),                                                                            // sliceCount
        blocking,                                                                                   // blocking
        shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC,                             // dcFlush
        multiDispatchInfo.usesSlm(),                                                                // useSLM
        !getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(),                              // guardCommandBufferWithPipeControl
        commandType == CL_COMMAND_NDRANGE_KERNEL,                                                   // GSBA32BitRequired
        requiresCoherency,                                                                          // requiresCoherency
        (QueuePriority::LOW == priority),                                                           // lowPriority
        implicitFlush,                                                                              // implicitFlush
        !eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed
        false,                                                                                      // epilogueRequired
        false,                                                                                      // usePerDssBackedBuffer
        kernel->isSingleSubdevicePreferred(),                                                       // useSingleSubdevice
        useGlobalAtomics,                                                                           // useGlobalAtomics
        kernel->areMultipleSubDevicesInContext(),                                                   // areMultipleSubDevicesInContext
        kernel->requiresMemoryMigration(),                                                          // memoryMigrationRequired
        isTextureCacheFlushNeeded(commandType));                                                    // textureCacheFlush

    dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired;
    dispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode = systolicPipelineSelectMode;

    dispatchFlags.disableEUFusion = kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion;

    const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();

    if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() && !clearDependenciesForSubCapture) {
        eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
        if (isHandlingBarrier) {
            fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
        }
        dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
    }

    DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady);

    if (anyUncacheableArgs) {
        dispatchFlags.l3CacheSettings = L3CachingSettings::l3CacheOff;
    } else if (!kernel->areStatelessWritesUsed()) {
        dispatchFlags.l3CacheSettings = L3CachingSettings::l3AndL1On;
    }

    if (this->dispatchHints != 0) {
        dispatchFlags.engineHints = this->dispatchHints;
        dispatchFlags.epilogueRequired = true;
    }

    if (gtpinIsGTPinInitialized()) {
        gtpinNotifyPreFlushTask(this);
    }
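    // Any aux-translation blits prepared earlier are flushed on the BCS before the
    // compute flushTask below.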
    if (enqueueProperties.blitPropertiesContainer->size() > 0) {
        auto bcsCsr = getBcsForAuxTranslation();
        const auto newTaskCount = bcsCsr->flushBcsTask(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
        if (!newTaskCount) {
            CompletionStamp completionStamp{};
            completionStamp.taskCount = CompletionStamp::gpuHang;

            return completionStamp;
        }

        this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), *newTaskCount);
        dispatchFlags.implicitFlush = true;
    }

    PRINT_DEBUG_STRING(DebugManager.flags.PrintDebugMessages.get(), stdout, "preemption = %d.\n", static_cast<int>(dispatchFlags.preemptionMode));
    CompletionStamp completionStamp = getGpgpuCommandStreamReceiver().flushTask(
        commandStream,
        commandStreamStart,
        dsh,
        ioh,
        &getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 0u),
        taskLevel,
        dispatchFlags,
        getDevice());

    if (isHandlingBarrier) {
        clearLastBcsPackets();
    }

    if (gtpinIsGTPinInitialized()) {
        gtpinNotifyFlushTask(completionStamp.taskCount);
    }

    return completionStamp;
}
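// Blocked path: instead of flushing, wrap the prepared work in a Command object and
// chain it to the queue's virtual event; completion of the parent events triggers the
// actual submission.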
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::enqueueBlocked(
    uint32_t commandType,
    Surface **surfaces,
    size_t surfaceCount,
    const MultiDispatchInfo &multiDispatchInfo,
    TimestampPacketDependencies &timestampPacketDependencies,
    std::unique_ptr<KernelOperation> &blockedCommandsData,
    const EnqueueProperties &enqueueProperties,
    EventsRequest &eventsRequest,
    EventBuilder &externalEventBuilder,
    std::unique_ptr<PrintfHandler> &&printfHandler,
    CommandStreamReceiver *bcsCsr) {
    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);

    // store the previous virtual event, as it will add dependencies to the new virtual event
    if (this->virtualEvent) {
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "previousVirtualEvent", this->virtualEvent);
    }

    EventBuilder internalEventBuilder;
    EventBuilder *eventBuilder;
    // check if the event will be exposed externally
    if (externalEventBuilder.getEvent()) {
        externalEventBuilder.getEvent()->incRefInternal();
        eventBuilder = &externalEventBuilder;
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "output event as virtualEvent", virtualEvent);
    } else {
        // it will be an internal event
        internalEventBuilder.create<VirtualEvent>(this, context);
        eventBuilder = &internalEventBuilder;
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "new virtualEvent", eventBuilder->getEvent());
    }
    auto outEvent = eventBuilder->getEvent();

    // update queue taskCount
    taskCount = outEvent->getCompletionStamp();

    std::unique_ptr<Command> command;
    bool storeTimestampPackets = false;

    if (blockedCommandsData) {
        if (enqueueProperties.blitPropertiesContainer) {
            blockedCommandsData->blitPropertiesContainer = *enqueueProperties.blitPropertiesContainer;
            blockedCommandsData->bcsCsr = bcsCsr;
            blockedCommandsData->blitEnqueue = true;
        }

        storeTimestampPackets = (timestampPacketContainer != nullptr);
    }

    if (enqueueProperties.operation != EnqueueProperties::Operation::GpuKernel) {
        command = std::make_unique<CommandWithoutKernel>(*this, blockedCommandsData);
    } else {
        // store task data in the event
        std::vector<Surface *> allSurfaces;
        Kernel *kernel = nullptr;
        for (auto &dispatchInfo : multiDispatchInfo) {
            if (kernel != dispatchInfo.getKernel()) {
                kernel = dispatchInfo.getKernel();
            } else {
                continue;
            }
            kernel->getResidency(allSurfaces);
        }

        allSurfaces.reserve(allSurfaces.size() + surfaceCount);
        for (auto &surface : createRange(surfaces, surfaceCount)) {
            allSurfaces.push_back(surface->duplicate());
        }

        PreemptionMode preemptionMode = ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo);
        bool slmUsed = multiDispatchInfo.usesSlm();
        command = std::make_unique<CommandComputeKernel>(*this,
                                                         blockedCommandsData,
                                                         std::move(allSurfaces),
                                                         shouldFlushDC(commandType, printfHandler.get()),
                                                         slmUsed,
                                                         commandType,
                                                         std::move(printfHandler),
                                                         preemptionMode,
                                                         multiDispatchInfo.peekMainKernel(),
                                                         (uint32_t)multiDispatchInfo.size());
    }
    if (storeTimestampPackets) {
        command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies));
        command->setEventsRequest(eventsRequest);
    } else if (this->context->getRootDeviceIndices().size() > 1) {
        command->setEventsRequest(eventsRequest);
    }

    outEvent->setCommand(std::move(command));

    eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventsRequest.eventWaitList, eventsRequest.numEventsInWaitList));
    eventBuilder->addParentEvent(this->virtualEvent);
    eventBuilder->finalize();

    if (this->virtualEvent) {
        this->virtualEvent->decRefInternal();
    }

    this->virtualEvent = outEvent;
}
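// Submits commands that carry no kernel (barriers, markers, cache flushes, blits).
// The GPGPU CSR is flushed only when a compute-side command stream exists; a blit
// operation is additionally flushed through the BCS via flushBcsTask.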
template <typename GfxFamily>
CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
    Surface **surfaces,
    size_t surfaceCount,
    LinearStream *commandStream,
    size_t commandStreamStart,
    bool &blocking,
    const EnqueueProperties &enqueueProperties,
    TimestampPacketDependencies &timestampPacketDependencies,
    EventsRequest &eventsRequest,
    EventBuilder &eventBuilder,
    uint32_t taskLevel,
    CsrDependencies &csrDeps,
    CommandStreamReceiver *bcsCsr) {
    CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
    bool flushGpgpuCsr = true;

    if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && commandStream == nullptr) {
        flushGpgpuCsr = false;
    } else {
        csrDeps.makeResident(getGpgpuCommandStreamReceiver());
    }

    if (eventBuilder.getEvent() && isProfilingEnabled()) {
        eventBuilder.getEvent()->setSubmitTimeStamp();
        eventBuilder.getEvent()->setStartTimeStamp();
    }

    if (flushGpgpuCsr) {
        if (timestampPacketContainer) {
            timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
            timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
            timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
        }

        for (auto surface : createRange(surfaces, surfaceCount)) {
            surface->makeResident(getGpgpuCommandStreamReceiver());
        }

        auto rootDeviceIndex = getDevice().getRootDeviceIndex();

        DispatchFlags dispatchFlags(
            {},                                                                  // csrDependencies
            &timestampPacketDependencies.barrierNodes,                           // barrierTimestampPacketNodes
            {},                                                                  // pipelineSelectArgs
            flushStamp->getStampReference(),                                     // flushStampReference
            getThrottle(),                                                       // throttle
            device->getPreemptionMode(),                                         // preemptionMode
            GrfConfig::NotApplicable,                                            // numGrfRequired
            L3CachingSettings::NotApplicable,                                    // l3CacheSettings
            ThreadArbitrationPolicy::NotPresent,                                 // threadArbitrationPolicy
            AdditionalKernelExecInfo::NotApplicable,                             // additionalKernelExecInfo
            KernelExecutionType::NotApplicable,                                  // kernelExecutionType
            MemoryCompressionState::NotApplicable,                               // memoryCompressionState
            getSliceCount(),                                                     // sliceCount
            blocking,                                                            // blocking
            false,                                                               // dcFlush
            false,                                                               // useSLM
            !getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(),       // guardCommandBufferWithPipeControl
            false,                                                               // GSBA32BitRequired
            false,                                                               // requiresCoherency
            false,                                                               // lowPriority
            (enqueueProperties.operation == EnqueueProperties::Operation::Blit), // implicitFlush
            getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(),      // outOfOrderExecutionAllowed
            false,                                                               // epilogueRequired
            false,                                                               // usePerDssBackedBuffer
            false,                                                               // useSingleSubdevice
            false,                                                               // useGlobalAtomics
            context->containsMultipleSubDevices(rootDeviceIndex),                // areMultipleSubDevicesInContext
            false,                                                               // memoryMigrationRequired
            false);                                                              // textureCacheFlush

        const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();

        if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
            eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
            if (isHandlingBarrier) {
                fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies);
            }
            dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
        }

        completionStamp = getGpgpuCommandStreamReceiver().flushTask(
            *commandStream,
            commandStreamStart,
            &getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 0u),
            &getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 0u),
            &getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 0u),
            taskLevel,
            dispatchFlags,
            getDevice());

        if (isHandlingBarrier) {
            clearLastBcsPackets();
        }
    }

    if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
        UNRECOVERABLE_IF(!enqueueProperties.blitPropertiesContainer);
        const auto newTaskCount = bcsCsr->flushBcsTask(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
        if (!newTaskCount) {
            CompletionStamp completionStamp{};
            completionStamp.taskCount = CompletionStamp::gpuHang;
            return completionStamp;
        }

        this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), *newTaskCount);
    }

    return completionStamp;
}

template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::computeOffsetsValueForRectCommands(size_t *bufferOffset,
                                                                   size_t *hostOffset,
                                                                   const size_t *bufferOrigin,
                                                                   const size_t *hostOrigin,
                                                                   const size_t *region,
                                                                   size_t bufferRowPitch,
                                                                   size_t bufferSlicePitch,
                                                                   size_t hostRowPitch,
                                                                   size_t hostSlicePitch) {
    size_t computedBufferRowPitch = bufferRowPitch ? bufferRowPitch : region[0];
    size_t computedBufferSlicePitch = bufferSlicePitch ? bufferSlicePitch : region[1] * computedBufferRowPitch;

    size_t computedHostRowPitch = hostRowPitch ? hostRowPitch : region[0];
    size_t computedHostSlicePitch = hostSlicePitch ? hostSlicePitch : region[1] * computedHostRowPitch;

    *bufferOffset = bufferOrigin[2] * computedBufferSlicePitch + bufferOrigin[1] * computedBufferRowPitch + bufferOrigin[0];
    *hostOffset = hostOrigin[2] * computedHostSlicePitch + hostOrigin[1] * computedHostRowPitch + hostOrigin[0];
}

template <typename GfxFamily>
size_t CommandQueueHw<GfxFamily>::calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image) {
    auto bytesPerPixel = image->getSurfaceFormatInfo().surfaceFormat.ImageElementSizeInBytes;
    auto dstRowPitch = rowPitch ? rowPitch : region[0] * bytesPerPixel;
    auto dstSlicePitch = slicePitch ? slicePitch : ((image->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 1 : region[1]) * dstRowPitch);

    return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type);
}

template <typename GfxFamily>
bool CommandQueueHw<GfxFamily>::isSplitEnqueueBlitNeeded(TransferDirection transferDirection, size_t transferSize, CommandStreamReceiver &csr) {
    constexpr size_t minimalSizeForBcsSplit = 16 * MemoryConstants::megaByte;

    auto bcsSplit = getDevice().isBcsSplitSupported() &&
                    csr.getOsContext().getEngineType() == aub_stream::EngineType::ENGINE_BCS &&
                    transferSize >= minimalSizeForBcsSplit &&
                    (transferDirection == TransferDirection::HostToLocal ||
                     transferDirection == TransferDirection::LocalToHost);

    if (bcsSplit) {
        this->constructBcsEnginesForSplit();
    }

    return bcsSplit;
}

template <typename GfxFamily>
size_t CommandQueueHw<GfxFamily>::getTotalSizeFromRectRegion(const size_t *region) {
    auto size = region[0];
    size *= (region[1] == 0 ? 1 : region[1]);
    size *= (region[2] == 0 ? 1 : region[2]);
    return size;
}
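// Splits one large transfer into roughly equal chunks across the available split BCS
// engines. Each chunk inherits the previous enqueue's timestamp nodes, and the
// per-chunk output nodes are aggregated so the caller observes a single completion.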
template <typename GfxFamily>
template <uint32_t cmdType>
cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr) {
    auto ret = CL_SUCCESS;
    this->releaseMainCopyEngine();

    StackVec<std::unique_lock<CommandStreamReceiver::MutexType>, 4u> locks;
    StackVec<CommandStreamReceiver *, 4u> copyEngines;
    for (uint32_t i = 0; i < bcsInfoMaskSize; i++) {
        if (this->splitEngines.test(i)) {
            auto engineType = EngineHelpers::mapBcsIndexToEngineType(i, true);
            auto bcs = getBcsCommandStreamReceiver(engineType);
            if (bcs) {
                locks.push_back(std::move(bcs->obtainUniqueOwnership()));
                copyEngines.push_back(bcs);
            }
        }
    }
    DEBUG_BREAK_IF(copyEngines.size() == 0);

    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);

    TimestampPacketContainer splitNodes;
    TimestampPacketContainer previousEnqueueNode;
    previousEnqueueNode.swapNodes(*this->timestampPacketContainer);

    auto srcOffset = dispatchInfo.peekBuiltinOpParams().srcOffset.x;
    auto dstOffset = dispatchInfo.peekBuiltinOpParams().dstOffset.x;
    auto size = dispatchInfo.peekBuiltinOpParams().size.x;
    auto remainingSize = size;

    for (size_t i = 0; i < copyEngines.size(); i++) {
        auto localSize = remainingSize / (copyEngines.size() - i);
        auto localParams = dispatchInfo.peekBuiltinOpParams();
        localParams.size.x = localSize;
        localParams.srcOffset.x = (srcOffset + size - remainingSize);
        localParams.dstOffset.x = (dstOffset + size - remainingSize);

        dispatchInfo.setBuiltinOpParams(localParams);
        remainingSize -= localSize;

        this->timestampPacketContainer->assignAndIncrementNodesRefCounts(previousEnqueueNode);

        ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, remainingSize == 0 ? event : nullptr, false, *copyEngines[i]);
        DEBUG_BREAK_IF(ret != CL_SUCCESS);

        this->timestampPacketContainer->moveNodesToNewContainer(splitNodes);
    }

    if (event) {
        auto e = castToObjectOrAbort<Event>(*event);
        e->addTimestampPacketNodes(splitNodes);
    }

    this->timestampPacketContainer->swapNodes(splitNodes);

    queueOwnership.unlock();
    for (auto &lock : locks) {
        lock.unlock();
    }

    if (blocking) {
        ret = this->finish();
    }

    return ret;
}
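// Enqueues a single copy on a BCS engine. A GPGPU submission is still generated when
// required (for example, for a cache flush or a pending barrier); the locking of the
// compute CSR is controlled by the ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission flag.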
template <typename GfxFamily>
template <uint32_t cmdType>
cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr) {
    auto bcsCommandStreamReceiverOwnership = bcsCsr.obtainUniqueOwnership();
    std::unique_lock<CommandStreamReceiver::MutexType> commandStreamReceiverOwnership;

    EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
    EventBuilder eventBuilder;

    setupEvent(eventBuilder, eventsRequest.outEvent, cmdType);
    eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);

    std::unique_ptr<KernelOperation> blockedCommandsData;
    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
    if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) {
        commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
    }

    auto blockQueue = false;
    auto taskLevel = 0u;
    obtainTaskLevelAndBlockedStatus(taskLevel, eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, blockQueue, cmdType);
    auto clearAllDependencies = queueDependenciesClearRequired();

    enqueueHandlerHook(cmdType, multiDispatchInfo);
    aubCaptureHook(blocking, clearAllDependencies, multiDispatchInfo);

    if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
        blocking = true;
    }

    TimestampPacketDependencies timestampPacketDependencies;
    BlitPropertiesContainer blitPropertiesContainer;
    CsrDependencies csrDeps;

    eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
    auto allocator = bcsCsr.getTimestampPacketAllocator();

    if (!blockQueue) {
        setupBarrierTimestampForBcsEngines(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);
        if (isOOQEnabled()) {
            TimestampPacketContainer clearBarrierNodes;
            timestampPacketDependencies.barrierNodes.swapNodes(clearBarrierNodes);
        }
    }
    processBarrierTimestampForBcsEngine(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);

    auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies);
    if (isCacheFlushForBcsRequired() && gpgpuSubmission) {
        timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
    }

    obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
    csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);

    if (eventBuilder.getEvent()) {
        eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
    }

    CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};

    const EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);

    LinearStream *gpgpuCommandStream = {};
    size_t gpgpuCommandStreamStart = {};
    if (gpgpuSubmission) {
        if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
            commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
        }
        gpgpuCommandStream = obtainCommandStream<cmdType>(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false);
        gpgpuCommandStreamStart = gpgpuCommandStream->getUsed();
    }

    blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
                                                                    eventsRequest, gpgpuCommandStream, cmdType, blockQueue));

    if (!blockQueue) {
        completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
                                                      enqueueProperties, timestampPacketDependencies, eventsRequest,
                                                      eventBuilder, taskLevel, csrDeps, &bcsCsr);
        if (completionStamp.taskCount == CompletionStamp::gpuHang) {
            return CL_OUT_OF_RESOURCES;
        }

        if (gpgpuSubmission) {
            if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
                commandStreamReceiverOwnership.unlock();
            }
        }

        if (eventBuilder.getEvent()) {
            eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
        }

        this->latestSentEnqueueType = enqueueProperties.operation;

        setLastBcsPacket(bcsCsr.getOsContext().getEngineType());
    }

    updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());

    if (blockQueue) {
        enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData,
                       enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr);

        if (gpgpuSubmission) {
            if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
                commandStreamReceiverOwnership.unlock();
            }
        }
    }

    timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
    csrDeps.copyNodesToNewContainer(*deferredTimestampPackets);

    if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) {
        commandStreamReceiverOwnership.unlock();
    }
    queueOwnership.unlock();
    bcsCommandStreamReceiverOwnership.unlock();

    if (blocking) {
        const auto waitStatus = waitForAllEngines(blockQueue, nullptr);
        if (waitStatus == WaitStatus::GpuHang) {
            return CL_OUT_OF_RESOURCES;
        }
    }

    return CL_SUCCESS;
}
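// Routes a built-in operation either to the copy-engine path (enqueueBlit or
// enqueueBlitSplit) or to the regular GPGPU built-in kernel path, based on the engine
// type of the CSR selected for the transfer.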
template <typename GfxFamily>
template <uint32_t cmdType, size_t surfaceCount>
cl_int CommandQueueHw<GfxFamily>::dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &dispatchInfo, Surface *(&surfaces)[surfaceCount], EBuiltInOps::Type builtInOperation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr) {
    const bool blit = EngineHelpers::isBcs(csr.getOsContext().getEngineType());

    if (blit) {
        cl_int ret = CL_SUCCESS;

        if (dispatchInfo.peekBuiltinOpParams().bcsSplit) {
            ret = enqueueBlitSplit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
        } else {
            ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
        }

        return ret;
    } else {
        auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInOperation, this->getClDevice());
        BuiltInOwnershipWrapper builtInLock(builder, this->context);

        builder.buildDispatchInfos(dispatchInfo);

        return enqueueHandler<cmdType>(
            surfaces,
            blocking,
            dispatchInfo,
            numEventsInWaitList,
            eventWaitList,
            event);
    }
}

template <typename GfxFamily>
bool CommandQueueHw<GfxFamily>::isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo) {
    return multiDispatchInfo.getKernelObjsForAuxTranslation() &&
           (multiDispatchInfo.getKernelObjsForAuxTranslation()->size() > 0) &&
           (HwHelperHw<GfxFamily>::get().getAuxTranslationMode(device->getHardwareInfo()) == AuxTranslationMode::Blit);
}

} // namespace NEO