/*
 * Copyright (C) 2017-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/built_ins/built_ins.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/surface.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/program/sync_buffer_handler.h"
#include "shared/source/utilities/range.h"
#include "shared/source/utilities/tag_allocator.h"

#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/builtin_kernels_simulation/scheduler_simulation.h"
#include "opencl/source/command_queue/command_queue_hw.h"
#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/command_queue/hardware_interface.h"
#include "opencl/source/event/event_builder.h"
#include "opencl/source/event/user_event.h"
#include "opencl/source/gtpin/gtpin_notify.h"
#include "opencl/source/helpers/cl_blit_properties.h"
#include "opencl/source/helpers/dispatch_info_builder.h"
#include "opencl/source/helpers/enqueue_properties.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/source/helpers/task_information.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/source/program/block_kernel_manager.h"
#include "opencl/source/program/printf_handler.h"

#include "pipe_control_args.h"

#include <algorithm>
#include <new>

namespace NEO {

template <typename GfxFamily>
template <uint32_t commandType, size_t surfaceCount>
void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount],
                                               bool blocking,
                                               Kernel *kernel,
                                               cl_uint workDim,
                                               const size_t globalOffsets[3],
                                               const size_t workItems[3],
                                               const size_t *localWorkSizesIn,
                                               const size_t *enqueuedWorkSizes,
                                               cl_uint numEventsInWaitList,
                                               const cl_event *eventWaitList,
                                               cl_event *event) {
    BuiltInOwnershipWrapper builtInLock;
    MemObjsForAuxTranslation memObjsForAuxTranslation;
    MultiDispatchInfo multiDispatchInfo(kernel);

    if (DebugManager.flags.ForceDispatchScheduler.get()) {
        forceDispatchScheduler(multiDispatchInfo);
    } else {
        if (kernel->isAuxTranslationRequired()) {
            auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getDevice());
            builtInLock.takeOwnership(builder, this->context);
            kernel->fillWithBuffersForAuxTranslation(memObjsForAuxTranslation);
            multiDispatchInfo.setMemObjsForAuxTranslation(memObjsForAuxTranslation);
            if (!memObjsForAuxTranslation.empty()) {
                dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::AuxToNonAux);
            }
        }

        if (kernel->getKernelInfo().builtinDispatchBuilder == nullptr) {
            DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> builder;
            builder.setDispatchGeometry(workDim, workItems, enqueuedWorkSizes, globalOffsets, Vec3<size_t>{0, 0, 0}, localWorkSizesIn);
            builder.setKernel(kernel);
            builder.bake(multiDispatchInfo);
        } else {
            auto builder = kernel->getKernelInfo().builtinDispatchBuilder;
            builder->buildDispatchInfos(multiDispatchInfo, kernel, workDim, workItems, enqueuedWorkSizes, globalOffsets);

            if (multiDispatchInfo.size() == 0) {
                return;
            }
        }

        if (kernel->isAuxTranslationRequired()) {
            if (!memObjsForAuxTranslation.empty()) {
                UNRECOVERABLE_IF(kernel->isParentKernel);
                dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::NonAuxToAux);
            }
        }
    }

    if (HwHelperHw<GfxFamily>::isBlitAuxTranslationRequired(device->getHardwareInfo(), multiDispatchInfo)) {
        setupBlitAuxTranslation(multiDispatchInfo);
    }

    enqueueHandler<commandType>(surfaces, blocking, multiDispatchInfo, numEventsInWaitList, eventWaitList, event);
}
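// Debug-only override: the ForceDispatchScheduler flag replaces the regular kernel
// dispatch with a dispatch of the device-side scheduler kernel, reusing the default
// device queue's buffers as the scheduler's arguments.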
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::forceDispatchScheduler(NEO::MultiDispatchInfo &multiDispatchInfo) {
    SchedulerKernel &scheduler = getContext().getSchedulerKernel();
    DispatchInfo dispatchInfo(&scheduler, 1, Vec3<size_t>(scheduler.getGws(), 1, 1), Vec3<size_t>(scheduler.getLws(), 1, 1), Vec3<size_t>(0, 0, 0));

    auto devQueue = this->getContext().getDefaultDeviceQueue();
    DeviceQueueHw<GfxFamily> *devQueueHw = castToObjectOrAbort<DeviceQueueHw<GfxFamily>>(devQueue);

    scheduler.createReflectionSurface();
    GraphicsAllocation *reflectionSurface = scheduler.getKernelReflectionSurface();

    devQueueHw->resetDeviceQueue();

    scheduler.setArgs(devQueueHw->getQueueBuffer(),
                      devQueueHw->getStackBuffer(),
                      devQueueHw->getEventPoolBuffer(),
                      devQueueHw->getSlbBuffer(),
                      devQueueHw->getDshBuffer(),
                      reflectionSurface,
                      devQueueHw->getQueueStorageBuffer(),
                      this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation());

    multiDispatchInfo.push(dispatchInfo);
}

template <typename GfxFamily>
template <uint32_t commandType>
void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                                               size_t numSurfaceForResidency,
                                               bool blocking,
                                               const MultiDispatchInfo &multiDispatchInfo,
                                               cl_uint numEventsInWaitList,
                                               const cl_event *eventWaitList,
                                               cl_event *event) {
    if (multiDispatchInfo.empty() && !isCommandWithoutKernel(commandType)) {
        enqueueHandler<CL_COMMAND_MARKER>(surfacesForResidency, numSurfaceForResidency, blocking, multiDispatchInfo, numEventsInWaitList, eventWaitList, event);
        if (event) {
            castToObjectOrAbort<Event>(*event)->setCmdType(commandType);
        }
        return;
    }

    Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
    auto devQueue = this->getContext().getDefaultDeviceQueue();
    DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);
    auto clearAllDependencies = queueDependenciesClearRequired();

    TagNode<HwTimeStamps> *hwTimeStamps = nullptr;

    auto commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();

    TimeStampData queueTimeStamp;
    if (isProfilingEnabled() && event) {
        this->getDevice().getOSTime()->getCpuGpuTime(&queueTimeStamp);
    }
    EventBuilder eventBuilder;
    if (event) {
        eventBuilder.create<Event>(this, commandType, CompletionStamp::levelNotReady, 0);
        *event = eventBuilder.getEvent();
        if (eventBuilder.getEvent()->isProfilingEnabled()) {
            eventBuilder.getEvent()->setQueueTimeStamp(&queueTimeStamp);
            if (isCommandWithoutKernel(commandType)) {
                eventBuilder.getEvent()->setCPUProfilingPath(true);
                eventBuilder.getEvent()->setQueueTimeStamp();
            }
        }
        DBG_LOG(EventsDebugEnable, "enqueueHandler commandType", commandType, "output Event", eventBuilder.getEvent());
    }

    std::unique_ptr<KernelOperation> blockedCommandsData;
    std::unique_ptr<PrintfHandler> printfHandler;
    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);

    auto blockQueue = false;
    auto taskLevel = 0u;
    obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitList, eventWaitList, blockQueue, commandType);
    bool blitEnqueue = blitEnqueueAllowed(commandType);

    DBG_LOG(EventsDebugEnable, "blockQueue", blockQueue, "virtualEvent", virtualEvent, "taskLevel", taskLevel);

    if (parentKernel && !blockQueue) {
        while (!devQueueHw->isEMCriticalSectionFree())
            ;
    }

    enqueueHandlerHook(commandType, multiDispatchInfo);
    aubCaptureHook(blocking, clearAllDependencies, multiDispatchInfo);

    if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
        blocking = true;
    }

    TimestampPacketDependencies timestampPacketDependencies;
    EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
    CsrDependencies csrDeps;
    BlitPropertiesContainer blitPropertiesContainer;
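    // When timestamp packets are enabled (or this is a blit enqueue), gather the wait-list
    // dependencies already living on this CSR and allocate the timestamp nodes the new
    // command will signal; the cache-flush and barrier cases each take one extra node.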
    bool enqueueWithBlitAuxTranslation = HwHelperHw<GfxFamily>::isBlitAuxTranslationRequired(device->getHardwareInfo(), multiDispatchInfo);

    if (blitEnqueue || getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
        eventsRequest.fillCsrDependencies(csrDeps, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
        auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();

        size_t nodesCount = 0u;
        if (blitEnqueue || isCacheFlushCommand(commandType)) {
            nodesCount = 1;
        } else if (!multiDispatchInfo.empty()) {
            nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
        }

        if (isCacheFlushForBcsRequired() && (blitEnqueue || enqueueWithBlitAuxTranslation)) {
            timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
        }

        if (blitEnqueue && !blockQueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) {
            timestampPacketDependencies.barrierNodes.add(allocator->getTag());
        }

        if (nodesCount > 0) {
            obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies);
            csrDeps.push_back(&timestampPacketDependencies.previousEnqueueNodes);
        }
    }

    auto &commandStream = *obtainCommandStream<commandType>(csrDeps, blitEnqueue, blockQueue, multiDispatchInfo, eventsRequest,
                                                            blockedCommandsData, surfacesForResidency, numSurfaceForResidency);
    auto commandStreamStart = commandStream.getUsed();

    if (HwHelperHw<GfxFamily>::isBlitAuxTranslationRequired(device->getHardwareInfo(), multiDispatchInfo)) {
        processDispatchForBlitAuxTranslation(multiDispatchInfo, blitPropertiesContainer, timestampPacketDependencies,
                                             eventsRequest, blockQueue);
    }

    if (eventBuilder.getEvent() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
        eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
        eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
    }

    bool flushDependenciesForNonKernelCommand = false;

    if (blitEnqueue) {
        blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(multiDispatchInfo, timestampPacketDependencies,
                                                                        eventsRequest, commandStream, commandType, blockQueue));
    } else if (multiDispatchInfo.empty() == false) {
        processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(), hwTimeStamps,
                                               blockQueue, devQueueHw, csrDeps, blockedCommandsData.get(),
                                               timestampPacketDependencies);
    } else if (isCacheFlushCommand(commandType)) {
        processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
    } else if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
        if (CL_COMMAND_BARRIER == commandType) {
            getGpgpuCommandStreamReceiver().requestStallingPipeControlOnNextFlush();
        }

        for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
            auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
            if (waitlistEvent->getTimestampPacketNodes()) {
                flushDependenciesForNonKernelCommand = true;
                if (eventBuilder.getEvent()) {
                    eventBuilder.getEvent()->addTimestampPacketNodes(*waitlistEvent->getTimestampPacketNodes());
                }
            }
        }
        if (flushDependenciesForNonKernelCommand) {
            TimestampPacketHelper::programCsrDependencies<GfxFamily>(commandStream, csrDeps, getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());
        }
    }

    CompletionStamp completionStamp = {CompletionStamp::levelNotReady, taskLevel, 0};

    const EnqueueProperties enqueueProperties(blitEnqueue, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType),
                                              flushDependenciesForNonKernelCommand, &blitPropertiesContainer);
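    // From here the enqueue splits: an unblocked queue submits now (GPU kernel, flush-only
    // command, or no submission at all), while a blocked queue packages the work into a
    // Command object that is replayed once the blocking user event completes.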
    if (!blockQueue) {
        csrDeps.makeResident(getGpgpuCommandStreamReceiver());

        if (parentKernel) {
            processDeviceEnqueue(devQueueHw, multiDispatchInfo, hwTimeStamps, blocking);
        }

        if (enqueueProperties.operation == EnqueueProperties::Operation::GpuKernel) {
            completionStamp = enqueueNonBlocked<commandType>(
                surfacesForResidency,
                numSurfaceForResidency,
                commandStream,
                commandStreamStart,
                blocking,
                multiDispatchInfo,
                enqueueProperties,
                timestampPacketDependencies,
                eventsRequest,
                eventBuilder,
                taskLevel,
                printfHandler.get());

            if (parentKernel) {
                getGpgpuCommandStreamReceiver().setMediaVFEStateDirty(true);
                if (devQueueHw->getSchedulerReturnInstance() > 0) {
                    waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
                    this->runSchedulerSimulation(*devQueueHw, *parentKernel);
                }
            }
        } else if (enqueueProperties.isFlushWithoutKernelRequired()) {
            completionStamp = enqueueCommandWithoutKernel(
                surfacesForResidency,
                numSurfaceForResidency,
                commandStream,
                commandStreamStart,
                blocking,
                enqueueProperties,
                timestampPacketDependencies,
                eventsRequest,
                eventBuilder,
                taskLevel);
        } else {
            UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::EnqueueWithoutSubmission);

            auto maxTaskCount = this->taskCount;
            for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
                auto event = castToObject<Event>(eventWaitList[eventId]);
                if (!event->isUserEvent() && !event->isExternallySynchronized()) {
                    maxTaskCount = std::max(maxTaskCount, event->peekTaskCount());
                }
            }

            //inherit data from event_wait_list and previous packets
            completionStamp.flushStamp = this->flushStamp->peekStamp();
            completionStamp.taskCount = maxTaskCount;
            completionStamp.taskLevel = taskLevel;

            if (eventBuilder.getEvent() && isProfilingEnabled()) {
                TimeStampData submitTimeStamp;
                this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
                eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
                eventBuilder.getEvent()->setSubmitTimeStamp();
                eventBuilder.getEvent()->setStartTimeStamp();
            }
        }
        if (eventBuilder.getEvent()) {
            eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
        }
    }
    updateFromCompletionStamp(completionStamp);

    if (eventBuilder.getEvent()) {
        eventBuilder.getEvent()->updateCompletionStamp(completionStamp.taskCount, completionStamp.taskLevel, completionStamp.flushStamp);
        FileLoggerInstance().log(DebugManager.flags.EventsDebugEnable.get(), "updateCompletionStamp Event", eventBuilder.getEvent(), "taskLevel", eventBuilder.getEvent()->taskLevel.load());
    }

    if (blockQueue) {
        if (parentKernel) {
            size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
            blockedCommandsData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        }

        enqueueBlocked(commandType, surfacesForResidency, numSurfaceForResidency, multiDispatchInfo, timestampPacketDependencies,
                       blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, std::move(printfHandler));
    }

    queueOwnership.unlock();
    commandStreamReceiverOwnership.unlock();

    if (blocking) {
        if (blockQueue) {
            while (isQueueBlocked()) {
            }
            waitUntilComplete(taskCount, flushStamp->peekStamp(), false);
        } else {
            waitUntilComplete(taskCount, flushStamp->peekStamp(), false);
            if (printfHandler) {
                printfHandler->printEnqueueOutput();
            }
        }
    }
}
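// Prepares a kernel dispatch: printf buffers, sync-buffer based work-group
// synchronization, debug and profiling surfaces, and parent-kernel (device enqueue)
// reflection surfaces, then emits the GPGPU walker through HardwareInterface.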
template <typename GfxFamily>
template <uint32_t commandType>
void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo,
                                                          std::unique_ptr<PrintfHandler> &printfHandler,
                                                          Event *event,
                                                          TagNode<HwTimeStamps> *&hwTimeStamps,
                                                          bool blockQueue,
                                                          DeviceQueueHw<GfxFamily> *devQueueHw,
                                                          CsrDependencies &csrDeps,
                                                          KernelOperation *blockedCommandsData,
                                                          TimestampPacketDependencies &timestampPacketDependencies) {
    TagNode<HwPerfCounter> *hwPerfCounter = nullptr;
    FileLoggerInstance().dumpKernelArgs(&multiDispatchInfo);

    printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
    if (printfHandler) {
        printfHandler->prepareDispatch(multiDispatchInfo);
    }

    if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
        auto &gws = multiDispatchInfo.begin()->getGWS();
        auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
        size_t workGroupsCount = (gws.x * gws.y * gws.z) /
                                 (lws.x * lws.y * lws.z);
        device->syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
    }

    if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
        if (multiDispatchInfo.peekMainKernel()->getProgram()->isKernelDebugEnabled()) {
            setupDebugSurface(multiDispatchInfo.peekMainKernel());
        }
    }

    if (event && this->isProfilingEnabled()) {
        // Get allocation for timestamps
        hwTimeStamps = event->getHwTimeStampNode();
    }

    if (auto parentKernel = multiDispatchInfo.peekParentKernel()) {
        parentKernel->createReflectionSurface();
        parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue());
        parentKernel->patchEventPool(context->getDefaultDeviceQueue());
        parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get());
        if (!blockQueue) {
            devQueueHw->resetDeviceQueue();
            devQueueHw->acquireEMCriticalSection();
        }
    }

    if (event && this->isPerfCountersEnabled()) {
        hwPerfCounter = event->getHwPerfCounterNode();
    }

    HardwareInterface<GfxFamily>::dispatchWalker(
        *this,
        multiDispatchInfo,
        csrDeps,
        blockedCommandsData,
        hwTimeStamps,
        hwPerfCounter,
        &timestampPacketDependencies,
        timestampPacketContainer.get(),
        commandType);

    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
        for (auto &dispatchInfo : multiDispatchInfo) {
            for (auto &patchInfoData : dispatchInfo.getKernel()->getPatchInfoDataList()) {
                getGpgpuCommandStreamReceiver().getFlatBatchBufferHelper().setPatchInfoData(patchInfoData);
            }
        }
    }

    getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(), multiDispatchInfo.getRequiredPrivateScratchSize());
}
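// Builds the BlitProperties for a BCS (copy engine) enqueue. Outside the blocked path it
// also wires up the wait-list and intra-queue timestamp dependencies and, when a cache
// flush is required before the blit, emits a PIPE_CONTROL that writes the cache-flush
// timestamp node as its post-sync operation.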
template <typename GfxFamily>
BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(const MultiDispatchInfo &multiDispatchInfo,
                                                                        TimestampPacketDependencies &timestampPacketDependencies,
                                                                        const EventsRequest &eventsRequest, LinearStream &commandStream,
                                                                        uint32_t commandType, bool queueBlocked) {
    auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType);

    auto blitCommandStreamReceiver = getBcsCommandStreamReceiver();

    auto blitProperties = ClBlitProperties::constructProperties(blitDirection, *blitCommandStreamReceiver,
                                                                multiDispatchInfo.peekBuiltinOpParams());
    if (!queueBlocked) {
        eventsRequest.fillCsrDependencies(blitProperties.csrDependencies, *blitCommandStreamReceiver,
                                          CsrDependencies::DependenciesType::All);

        blitProperties.csrDependencies.push_back(&timestampPacketDependencies.cacheFlushNodes);
        blitProperties.csrDependencies.push_back(&timestampPacketDependencies.previousEnqueueNodes);
        blitProperties.csrDependencies.push_back(&timestampPacketDependencies.barrierNodes);
    }

    auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
    blitProperties.outputTimestampPacket = currentTimestampPacketNode;

    if (isCacheFlushForBcsRequired()) {
        auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
        PipeControlArgs args(true);
        MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
            commandStream,
            GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
            cacheFlushTimestampPacketGpuAddress,
            0,
            device->getHardwareInfo(),
            args);
    }

    TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(commandStream, *currentTimestampPacketNode,
                                                                             getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());

    return blitProperties;
}

template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::processDispatchForBlitAuxTranslation(const MultiDispatchInfo &multiDispatchInfo,
                                                                     BlitPropertiesContainer &blitPropertiesContainer,
                                                                     TimestampPacketDependencies &timestampPacketDependencies,
                                                                     const EventsRequest &eventsRequest, bool queueBlocked) {
    auto nodesAllocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
    auto numBuffers = multiDispatchInfo.getMemObjsForAuxTranslation()->size();
    blitPropertiesContainer.resize(numBuffers * 2);

    auto bufferIndex = 0;
    for (auto &buffer : *multiDispatchInfo.getMemObjsForAuxTranslation()) {
        {
            // Aux to NonAux
            blitPropertiesContainer[bufferIndex] = BlitProperties::constructPropertiesForAuxTranslation(AuxTranslationDirection::AuxToNonAux,
                                                                                                        buffer->getGraphicsAllocation());
            auto auxToNonAuxNode = nodesAllocator->getTag();
            timestampPacketDependencies.auxToNonAuxNodes.add(auxToNonAuxNode);
        }

        {
            // NonAux to Aux
            blitPropertiesContainer[bufferIndex + numBuffers] = BlitProperties::constructPropertiesForAuxTranslation(AuxTranslationDirection::NonAuxToAux,
                                                                                                                     buffer->getGraphicsAllocation());
            auto nonAuxToAuxNode = nodesAllocator->getTag();
            timestampPacketDependencies.nonAuxToAuxNodes.add(nonAuxToAuxNode);
        }
        bufferIndex++;
    }

    if (!queueBlocked) {
        CsrDependencies csrDeps;
        eventsRequest.fillCsrDependencies(csrDeps, *getBcsCommandStreamReceiver(), CsrDependencies::DependenciesType::All);

        BlitProperties::setupDependenciesForAuxTranslation(blitPropertiesContainer, timestampPacketDependencies,
                                                           *this->timestampPacketContainer, csrDeps,
                                                           getGpgpuCommandStreamReceiver(), *getBcsCommandStreamReceiver());
    }
}

template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::processDispatchForCacheFlush(Surface **surfaces,
                                                             size_t numSurfaces,
                                                             LinearStream *commandStream,
                                                             CsrDependencies &csrDeps) {
    TimestampPacketHelper::programCsrDependencies<GfxFamily>(*commandStream, csrDeps, getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());

    uint64_t postSyncAddress = 0;
    if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
        auto timestampPacketNodeForPostSync = timestampPacketContainer->peekNodes().at(0);
        postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync);
    }

    submitCacheFlush(surfaces, numSurfaces, commandStream, postSyncAddress);
}
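// Device-side enqueue (OpenCL 2.x execution model): set up the execution-model dispatch,
// program the scheduler kernel's arguments from the device queue's buffers, and dispatch
// the scheduler right after the parent kernel in this queue's command stream.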
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *devQueueHw,
                                                     const MultiDispatchInfo &multiDispatchInfo,
                                                     TagNode<HwTimeStamps> *hwTimeStamps,
                                                     bool &blocking) {
    auto parentKernel = multiDispatchInfo.peekParentKernel();
    size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
    bool isCcsUsed = EngineHelpers::isCcs(gpgpuEngine->osContext->getEngineType());

    uint32_t taskCount = getGpgpuCommandStreamReceiver().peekTaskCount() + 1;
    devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
                                            *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
                                            parentKernel,
                                            (uint32_t)multiDispatchInfo.size(),
                                            getGpgpuCommandStreamReceiver().getTagAllocation()->getGpuAddress(),
                                            taskCount,
                                            hwTimeStamps,
                                            isCcsUsed);

    SchedulerKernel &scheduler = getContext().getSchedulerKernel();

    scheduler.setArgs(devQueueHw->getQueueBuffer(),
                      devQueueHw->getStackBuffer(),
                      devQueueHw->getEventPoolBuffer(),
                      devQueueHw->getSlbBuffer(),
                      devQueueHw->getDshBuffer(),
                      parentKernel->getKernelReflectionSurface(),
                      devQueueHw->getQueueStorageBuffer(),
                      this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation(),
                      devQueueHw->getDebugQueue());

    auto preemptionMode = PreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo);
    GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
        *this->commandStream,
        *devQueueHw,
        preemptionMode,
        scheduler,
        &getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
        devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
        isCcsUsed);

    scheduler.makeResident(getGpgpuCommandStreamReceiver());

    parentKernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(getGpgpuCommandStreamReceiver());

    if (parentKernel->isAuxTranslationRequired()) {
        blocking = true;
    }
}

template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType) {
    auto isQueueBlockedStatus = isQueueBlocked();
    taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
    blockQueueStatus = (taskLevel == CompletionStamp::levelNotReady) || isQueueBlockedStatus;

    auto taskLevelUpdateRequired = isTaskLevelUpdateRequired(taskLevel, eventWaitList, numEventsInWaitList, commandType);

    if (taskLevelUpdateRequired) {
        taskLevel++;
        this->taskLevel = taskLevel;
    }
}

template <typename GfxFamily>
bool CommandQueueHw<GfxFamily>::isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType) {
    bool updateTaskLevel = true;
    //if we are blocked by a user event, do not update
    if (taskLevel == CompletionStamp::levelNotReady) {
        updateTaskLevel = false;
    }
    //a command without a kernel inherits state from previous commands; barrier is the exception
    if (isCommandWithoutKernel(commandType) && commandType != CL_COMMAND_BARRIER) {
        updateTaskLevel = false;
    }
    //ooq special cases start here
    if (this->isOOQEnabled()) {
        //if there is no wait list and no barrier, do not update the task level
        if (eventWaitList == nullptr && commandType != CL_COMMAND_BARRIER) {
            updateTaskLevel = false;
        }
        //if there is a wait list, deduce the task level from it and check whether it is higher than the queue's current task level
        if (eventWaitList != nullptr) {
            auto taskLevelFromEvents = getTaskLevelFromWaitList(0, numEventsInWaitList, eventWaitList);
            taskLevelFromEvents++;
            if (taskLevelFromEvents <= this->taskLevel) {
                updateTaskLevel = false;
            }
        }
    }
    return updateTaskLevel;
}
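// Non-blocked submission: make all surfaces and heaps resident, derive the per-kernel
// DispatchFlags inputs (GRF count, coherency, sampler and pipeline-select requirements),
// and flush the task through the GPGPU command stream receiver.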
template <typename GfxFamily>
template <uint32_t commandType>
CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
    Surface **surfaces,
    size_t surfaceCount,
    LinearStream &commandStream,
    size_t commandStreamStart,
    bool &blocking,
    const MultiDispatchInfo &multiDispatchInfo,
    const EnqueueProperties &enqueueProperties,
    TimestampPacketDependencies &timestampPacketDependencies,
    EventsRequest &eventsRequest,
    EventBuilder &eventBuilder,
    uint32_t taskLevel,
    PrintfHandler *printfHandler) {

    UNRECOVERABLE_IF(multiDispatchInfo.empty());

    auto implicitFlush = false;

    if (printfHandler) {
        blocking = true;
        printfHandler->makeResident(getGpgpuCommandStreamReceiver());
    }

    if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
        device->syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver());
    }

    if (timestampPacketContainer) {
        timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
        timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
        timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
    }

    bool anyUncacheableArgs = false;
    auto requiresCoherency = false;
    for (auto surface : CreateRange(surfaces, surfaceCount)) {
        surface->makeResident(getGpgpuCommandStreamReceiver());
        requiresCoherency |= surface->IsCoherent;
        if (!surface->allowsL3Caching()) {
            anyUncacheableArgs = true;
        }
    }

    auto mediaSamplerRequired = false;
    uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber;
    auto specialPipelineSelectMode = false;
    Kernel *kernel = nullptr;
    bool usePerDssBackedBuffer = false;

    for (auto &dispatchInfo : multiDispatchInfo) {
        if (kernel != dispatchInfo.getKernel()) {
            kernel = dispatchInfo.getKernel();
        } else {
            continue;
        }

        kernel->makeResident(getGpgpuCommandStreamReceiver());
        requiresCoherency |= kernel->requiresCoherency();
        mediaSamplerRequired |= kernel->isVmeKernel();
        auto numGrfRequiredByKernel = kernel->getKernelInfo().patchInfo.executionEnvironment->NumGRFRequired;
        numGrfRequired = std::max(numGrfRequired, numGrfRequiredByKernel);
        specialPipelineSelectMode |= kernel->requiresSpecialPipelineSelectMode();
        if (kernel->hasUncacheableStatelessArgs()) {
            anyUncacheableArgs = true;
        }
        if (kernel->requiresPerDssBackedBuffer()) {
            usePerDssBackedBuffer = true;
        }
    }

    if (mediaSamplerRequired) {
        DEBUG_BREAK_IF(device->getDeviceInfo().preemptionSupported != false);
    }

    TimeStampData submitTimeStamp;
    if (isProfilingEnabled() && eventBuilder.getEvent()) {
        this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
        eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
        getGpgpuCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwTimeStampNode()->getBaseGraphicsAllocation());
        if (isPerfCountersEnabled()) {
            getGpgpuCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwPerfCounterNode()->getBaseGraphicsAllocation());
        }
    }

    IndirectHeap *dsh = nullptr;
    IndirectHeap *ioh = nullptr;

    if (multiDispatchInfo.peekParentKernel()) {
        DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(this->getContext().getDefaultDeviceQueue());
        DEBUG_BREAK_IF(pDevQueue == nullptr);
        dsh = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        // In ExecutionModel, IOH is the same as DSH to eliminate StateBaseAddress reprogramming for the scheduler kernel and blocks.
        ioh = dsh;
        implicitFlush = true;
    } else {
        dsh = &getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u);
        ioh = &getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u);
    }
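    // Decide whether any resident allocation forces a data-cache flush, then assemble the
    // DispatchFlags for flushTask; each positional argument below is annotated with the
    // name of the flag it sets.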
    auto allocNeedsFlushDC = false;
    if (!device->isFullRangeSvm()) {
        if (std::any_of(getGpgpuCommandStreamReceiver().getResidencyAllocations().begin(),
                        getGpgpuCommandStreamReceiver().getResidencyAllocations().end(),
                        [](const auto allocation) { return allocation->isFlushL3Required(); })) {
            allocNeedsFlushDC = true;
        }
    }

    DispatchFlags dispatchFlags(
        {},                                                                                         //csrDependencies
        &timestampPacketDependencies.barrierNodes,                                                  //barrierTimestampPacketNodes
        {},                                                                                         //pipelineSelectArgs
        this->flushStamp->getStampReference(),                                                      //flushStampReference
        getThrottle(),                                                                              //throttle
        PreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo),                       //preemptionMode
        numGrfRequired,                                                                             //numGrfRequired
        L3CachingSettings::l3CacheOn,                                                               //l3CacheSettings
        kernel->getThreadArbitrationPolicy(),                                                       //threadArbitrationPolicy
        getSliceCount(),                                                                            //sliceCount
        blocking,                                                                                   //blocking
        shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC,                             //dcFlush
        multiDispatchInfo.usesSlm() || multiDispatchInfo.peekParentKernel(),                        //useSLM
        true,                                                                                       //guardCommandBufferWithPipeControl
        commandType == CL_COMMAND_NDRANGE_KERNEL,                                                   //GSBA32BitRequired
        requiresCoherency,                                                                          //requiresCoherency
        (QueuePriority::LOW == priority),                                                           //lowPriority
        implicitFlush,                                                                              //implicitFlush
        !eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
        false,                                                                                      //epilogueRequired
        usePerDssBackedBuffer                                                                       //usePerDssBackedBuffer
    );

    dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired;
    dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode;

    if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
        eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
        dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
    }

    DEBUG_BREAK_IF(taskLevel >= CompletionStamp::levelNotReady);

    if (anyUncacheableArgs) {
        dispatchFlags.l3CacheSettings = L3CachingSettings::l3CacheOff;
    } else if (!kernel->areStatelessWritesUsed()) {
        dispatchFlags.l3CacheSettings = L3CachingSettings::l3AndL1On;
    }

    if (this->dispatchHints != 0) {
        dispatchFlags.engineHints = this->dispatchHints;
        dispatchFlags.epilogueRequired = true;
    }

    if (gtpinIsGTPinInitialized()) {
        gtpinNotifyPreFlushTask(this);
    }

    if (enqueueProperties.blitPropertiesContainer->size() > 0) {
        this->bcsTaskCount = getBcsCommandStreamReceiver()->blitBuffer(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled());
        dispatchFlags.implicitFlush = true;
    }

    printDebugString(DebugManager.flags.PrintDebugMessages.get(), stdout, "preemption = %d.\n", static_cast<int>(dispatchFlags.preemptionMode));
    CompletionStamp completionStamp = getGpgpuCommandStreamReceiver().flushTask(
        commandStream,
        commandStreamStart,
        *dsh,
        *ioh,
        getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
        taskLevel,
        dispatchFlags,
        getDevice());

    if (gtpinIsGTPinInitialized()) {
        gtpinNotifyFlushTask(completionStamp.taskCount);
    }

    return completionStamp;
}
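// Blocked-queue path: instead of submitting, wrap the prepared command stream in a
// Command object (CommandComputeKernel or CommandWithoutKernel), attach it to an event,
// and chain that event as the queue's new virtual event so submission order is preserved.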
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::enqueueBlocked(
    uint32_t commandType,
    Surface **surfaces,
    size_t surfaceCount,
    const MultiDispatchInfo &multiDispatchInfo,
    TimestampPacketDependencies &timestampPacketDependencies,
    std::unique_ptr<KernelOperation> &blockedCommandsData,
    const EnqueueProperties &enqueueProperties,
    EventsRequest &eventsRequest,
    EventBuilder &externalEventBuilder,
    std::unique_ptr<PrintfHandler> printfHandler) {

    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);

    //store the previous virtual event, as it will add dependencies to the new virtual event
    if (this->virtualEvent) {
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "previousVirtualEvent", this->virtualEvent);
    }

    EventBuilder internalEventBuilder;
    EventBuilder *eventBuilder;
    // check if the event will be exposed externally
    if (externalEventBuilder.getEvent()) {
        externalEventBuilder.getEvent()->incRefInternal();
        eventBuilder = &externalEventBuilder;
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "output event as virtualEvent", virtualEvent);
    } else {
        // it will be an internal event
        internalEventBuilder.create<VirtualEvent>(this, context);
        eventBuilder = &internalEventBuilder;
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "new virtualEvent", eventBuilder->getEvent());
    }
    auto outEvent = eventBuilder->getEvent();

    //update queue taskCount
    taskCount = outEvent->getCompletionStamp();

    std::unique_ptr<Command> command;
    bool storeTimestampPackets = false;

    if (blockedCommandsData) {
        if (enqueueProperties.blitPropertiesContainer) {
            blockedCommandsData->blitPropertiesContainer = *enqueueProperties.blitPropertiesContainer;
            blockedCommandsData->blitEnqueue = true;
        }

        storeTimestampPackets = (timestampPacketContainer != nullptr);
    }

    if (enqueueProperties.operation != EnqueueProperties::Operation::GpuKernel) {
        command = std::make_unique<CommandWithoutKernel>(*this, blockedCommandsData);
    } else {
        //store task data in event
        std::vector<Surface *> allSurfaces;
        Kernel *kernel = nullptr;
        for (auto &dispatchInfo : multiDispatchInfo) {
            if (kernel != dispatchInfo.getKernel()) {
                kernel = dispatchInfo.getKernel();
            } else {
                continue;
            }
            kernel->getResidency(allSurfaces);
        }
        for (auto &surface : CreateRange(surfaces, surfaceCount)) {
            allSurfaces.push_back(surface->duplicate());
        }

        PreemptionMode preemptionMode = PreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo);
        bool slmUsed = multiDispatchInfo.usesSlm() || multiDispatchInfo.peekParentKernel();
        command = std::make_unique<CommandComputeKernel>(*this,
                                                         blockedCommandsData,
                                                         allSurfaces,
                                                         shouldFlushDC(commandType, printfHandler.get()),
                                                         slmUsed,
                                                         commandType == CL_COMMAND_NDRANGE_KERNEL,
                                                         std::move(printfHandler),
                                                         preemptionMode,
                                                         multiDispatchInfo.peekMainKernel(),
                                                         (uint32_t)multiDispatchInfo.size());
    }
    if (storeTimestampPackets) {
        for (cl_uint i = 0; i < eventsRequest.numEventsInWaitList; i++) {
            auto event = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
            event->incRefInternal();
        }
        command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies));
        command->setEventsRequest(eventsRequest);
    }
    outEvent->setCommand(std::move(command));

    eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventsRequest.eventWaitList, eventsRequest.numEventsInWaitList));
    eventBuilder->addParentEvent(this->virtualEvent);
    eventBuilder->finalize();

    if (this->virtualEvent) {
        this->virtualEvent->decRefInternal();
    }

    this->virtualEvent = outEvent;
}
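// Submission path for commands that flush without a kernel (e.g. blits, cache flushes,
// and barrier-style commands with timestamp dependencies): only residency and dispatch
// flags are needed, and most kernel-related flags stay at their defaults.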
template <typename GfxFamily>
CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
    Surface **surfaces,
    size_t surfaceCount,
    LinearStream &commandStream,
    size_t commandStreamStart,
    bool &blocking,
    const EnqueueProperties &enqueueProperties,
    TimestampPacketDependencies &timestampPacketDependencies,
    EventsRequest &eventsRequest,
    EventBuilder &eventBuilder,
    uint32_t taskLevel) {

    if (timestampPacketContainer) {
        timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
        timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
        timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
    }

    for (auto surface : CreateRange(surfaces, surfaceCount)) {
        surface->makeResident(getGpgpuCommandStreamReceiver());
    }

    TimeStampData submitTimeStamp;
    if (eventBuilder.getEvent() && isProfilingEnabled() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
        this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
        eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
        eventBuilder.getEvent()->getTimestampPacketNodes()->makeResident(getGpgpuCommandStreamReceiver());
    }

    if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
        UNRECOVERABLE_IF(!enqueueProperties.blitPropertiesContainer);
        this->bcsTaskCount = getBcsCommandStreamReceiver()->blitBuffer(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled());
    }

    DispatchFlags dispatchFlags(
        {},                                                                  //csrDependencies
        &timestampPacketDependencies.barrierNodes,                           //barrierTimestampPacketNodes
        {},                                                                  //pipelineSelectArgs
        flushStamp->getStampReference(),                                     //flushStampReference
        getThrottle(),                                                       //throttle
        device->getPreemptionMode(),                                         //preemptionMode
        GrfConfig::DefaultGrfNumber,                                         //numGrfRequired
        L3CachingSettings::l3CacheOn,                                        //l3CacheSettings
        ThreadArbitrationPolicy::NotPresent,                                 //threadArbitrationPolicy
        getSliceCount(),                                                     //sliceCount
        blocking,                                                            //blocking
        false,                                                               //dcFlush
        false,                                                               //useSLM
        true,                                                                //guardCommandBufferWithPipeControl
        false,                                                               //GSBA32BitRequired
        false,                                                               //requiresCoherency
        false,                                                               //lowPriority
        (enqueueProperties.operation == EnqueueProperties::Operation::Blit), //implicitFlush
        getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(),      //outOfOrderExecutionAllowed
        false,                                                               //epilogueRequired
        false                                                                //usePerDssBackedBuffer
    );

    if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
        eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
        dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
    }

    CompletionStamp completionStamp = getGpgpuCommandStreamReceiver().flushTask(
        commandStream,
        commandStreamStart,
        getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
        getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
        getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
        taskLevel,
        dispatchFlags,
        getDevice());

    return completionStamp;
}
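// The rect offset math mirrors the OpenCL buffer-rect rule: offset = origin.z * slicePitch
// + origin.y * rowPitch + origin.x, with zero pitches defaulting to a tightly packed
// region. For example (illustrative values only): region = {16, 4, 2},
// bufferOrigin = {1, 2, 3} and no explicit pitches give rowPitch = 16,
// slicePitch = 4 * 16 = 64, so *bufferOffset = 3 * 64 + 2 * 16 + 1 = 225.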
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::computeOffsetsValueForRectCommands(size_t *bufferOffset,
                                                                   size_t *hostOffset,
                                                                   const size_t *bufferOrigin,
                                                                   const size_t *hostOrigin,
                                                                   const size_t *region,
                                                                   size_t bufferRowPitch,
                                                                   size_t bufferSlicePitch,
                                                                   size_t hostRowPitch,
                                                                   size_t hostSlicePitch) {
    size_t computedBufferRowPitch = bufferRowPitch ? bufferRowPitch : region[0];
    size_t computedBufferSlicePitch = bufferSlicePitch ? bufferSlicePitch : region[1] * computedBufferRowPitch;
    size_t computedHostRowPitch = hostRowPitch ? hostRowPitch : region[0];
    size_t computedHostSlicePitch = hostSlicePitch ? hostSlicePitch : region[1] * computedHostRowPitch;
    *bufferOffset = bufferOrigin[2] * computedBufferSlicePitch + bufferOrigin[1] * computedBufferRowPitch + bufferOrigin[0];
    *hostOffset = hostOrigin[2] * computedHostSlicePitch + hostOrigin[1] * computedHostRowPitch + hostOrigin[0];
}

template <typename GfxFamily>
size_t CommandQueueHw<GfxFamily>::calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image) {
    auto bytesPerPixel = image->getSurfaceFormatInfo().surfaceFormat.ImageElementSizeInBytes;
    auto dstRowPitch = rowPitch ? rowPitch : region[0] * bytesPerPixel;
    auto dstSlicePitch = slicePitch ? slicePitch : ((image->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 1 : region[1]) * dstRowPitch);

    return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type);
}
} // namespace NEO