/* * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/command_stream/preemption.h" #include "shared/source/helpers/engine_control.h" #include "shared/source/memory_manager/graphics_allocation.h" #include "shared/source/os_interface/os_context.h" #include "opencl/source/cl_device/cl_device.h" #include "opencl/source/command_queue/command_queue.h" #include "opencl/source/command_queue/csr_selection_args.h" #include "opencl/source/command_queue/gpgpu_walker.h" #include "opencl/source/helpers/dispatch_info.h" #include "opencl/source/helpers/queue_helpers.h" #include "opencl/source/mem_obj/mem_obj.h" #include "opencl/source/program/printf_handler.h" #include namespace NEO { class EventBuilder; struct EnqueueProperties; template class CommandQueueHw : public CommandQueue { using BaseClass = CommandQueue; public: CommandQueueHw(Context *context, ClDevice *device, const cl_queue_properties *properties, bool internalUsage) : BaseClass(context, device, properties, internalUsage) { auto clPriority = getCmdQueueProperties(properties, CL_QUEUE_PRIORITY_KHR); if (clPriority & static_cast(CL_QUEUE_PRIORITY_LOW_KHR)) { priority = QueuePriority::LOW; this->gpgpuEngine = &device->getNearestGenericSubDevice(0)->getEngine(getChosenEngineType(device->getHardwareInfo()), EngineUsage::LowPriority); } else if (clPriority & static_cast(CL_QUEUE_PRIORITY_MED_KHR)) { priority = QueuePriority::MEDIUM; } else if (clPriority & static_cast(CL_QUEUE_PRIORITY_HIGH_KHR)) { priority = QueuePriority::HIGH; } auto clThrottle = getCmdQueueProperties(properties, CL_QUEUE_THROTTLE_KHR); if (clThrottle & static_cast(CL_QUEUE_THROTTLE_LOW_KHR)) { throttle = QueueThrottle::LOW; } else if (clThrottle & static_cast(CL_QUEUE_THROTTLE_MED_KHR)) { throttle = QueueThrottle::MEDIUM; } else if (clThrottle & static_cast(CL_QUEUE_THROTTLE_HIGH_KHR)) { throttle = QueueThrottle::HIGH; } if (internalUsage) { this->gpgpuEngine = &device->getInternalEngine(); } if (gpgpuEngine) { this->initializeGpgpuInternals(); } uint64_t requestedSliceCount = getCmdQueueProperties(properties, CL_QUEUE_SLICE_COUNT_INTEL); if (requestedSliceCount > 0) { sliceCount = requestedSliceCount; } auto initializeGpgpu = false; if (DebugManager.flags.DeferCmdQGpgpuInitialization.get() != -1) { initializeGpgpu = !DebugManager.flags.DeferCmdQGpgpuInitialization.get(); } if (initializeGpgpu) { this->initializeGpgpu(); } for (const EngineControl *engine : bcsEngines) { if (engine != nullptr) { engine->osContext->ensureContextInitialized(); engine->commandStreamReceiver->initDirectSubmission(); } } } static CommandQueue *create(Context *context, ClDevice *device, const cl_queue_properties *properties, bool internalUsage) { return new CommandQueueHw(context, device, properties, internalUsage); } MOCKABLE_VIRTUAL void notifyEnqueueReadBuffer(Buffer *buffer, bool blockingRead, bool notifyBcsCsr); MOCKABLE_VIRTUAL void notifyEnqueueReadImage(Image *image, bool blockingRead, bool notifyBcsCsr); MOCKABLE_VIRTUAL void notifyEnqueueSVMMemcpy(GraphicsAllocation *gfxAllocation, bool blockingCopy, bool notifyBcsCsr); cl_int enqueueBarrierWithWaitList(cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueCopyBuffer(Buffer *srcBuffer, Buffer *dstBuffer, size_t srcOffset, size_t dstOffset, size_t size, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueCopyBufferRect(Buffer *srcBuffer, Buffer *dstBuffer, const size_t *srcOrigin, const size_t *dstOrigin, const size_t *region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueCopyImage(Image *srcImage, Image *dstImage, const size_t *srcOrigin, const size_t *dstOrigin, const size_t *region, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueFillBuffer(Buffer *buffer, const void *pattern, size_t patternSize, size_t offset, size_t size, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueFillImage(Image *image, const void *fillColor, const size_t *origin, const size_t *region, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueKernel(Kernel *kernel, cl_uint workDim, const size_t *globalWorkOffset, const size_t *globalWorkSize, const size_t *localWorkSize, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueSVMMap(cl_bool blockingMap, cl_map_flags mapFlags, void *svmPtr, size_t size, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool externalAppCall) override; cl_int enqueueSVMUnmap(void *svmPtr, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool externalAppCall) override; cl_int enqueueSVMFree(cl_uint numSvmPointers, void *svmPointers[], void(CL_CALLBACK *pfnFreeFunc)(cl_command_queue queue, cl_uint numSvmPointers, void *svmPointers[], void *userData), void *userData, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueSVMMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueSVMMemFill(void *svmPtr, const void *pattern, size_t patternSize, size_t size, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueMarkerWithWaitList(cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueMigrateMemObjects(cl_uint numMemObjects, const cl_mem *memObjects, cl_mem_migration_flags flags, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueSVMMigrateMem(cl_uint numSvmPointers, const void **svmPointers, const size_t *sizes, const cl_mem_migration_flags flags, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueReadBuffer(Buffer *buffer, cl_bool blockingRead, size_t offset, size_t size, void *ptr, GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueReadBufferRect(Buffer *buffer, cl_bool blockingRead, const size_t *bufferOrigin, const size_t *hostOrigin, const size_t *region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *ptr, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueReadImage(Image *srcImage, cl_bool blockingRead, const size_t *origin, const size_t *region, size_t rowPitch, size_t slicePitch, void *ptr, GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueWriteBuffer(Buffer *buffer, cl_bool blockingWrite, size_t offset, size_t cb, const void *ptr, GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueWriteBufferRect(Buffer *buffer, cl_bool blockingWrite, const size_t *bufferOrigin, const size_t *hostOrigin, const size_t *region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, const void *ptr, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueWriteImage(Image *dstImage, cl_bool blockingWrite, const size_t *origin, const size_t *region, size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueCopyBufferToImage(Buffer *srcBuffer, Image *dstImage, size_t srcOffset, const size_t *dstOrigin, const size_t *region, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueCopyImageToBuffer(Image *srcImage, Buffer *dstBuffer, const size_t *srcOrigin, const size_t *region, size_t dstOffset, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int enqueueResourceBarrier(BarrierCommand *resourceBarrier, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override; cl_int finish() override; cl_int flush() override; template cl_int enqueueHandler(Surface **surfacesForResidency, size_t numSurfaceForResidency, bool blocking, const MultiDispatchInfo &dispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); template cl_int enqueueHandler(Surface *(&surfacesForResidency)[size], bool blocking, const MultiDispatchInfo &dispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { return enqueueHandler(surfacesForResidency, size, blocking, dispatchInfo, numEventsInWaitList, eventWaitList, event); } template cl_int enqueueHandler(Surface *(&surfacesForResidency)[size], bool blocking, Kernel *kernel, cl_uint workDim, const size_t globalOffsets[3], const size_t workItems[3], const size_t *localWorkSizesIn, const size_t *enqueuedWorkSizes, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); template cl_int dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &dispatchInfo, Surface *(&surfaces)[surfaceCount], EBuiltInOps::Type builtInOperation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr); template cl_int enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr); bool isSplitEnqueueBlitNeeded(TransferDirection transferDirection, size_t transferSize, CommandStreamReceiver &csr); size_t getTotalSizeFromRectRegion(const size_t *region); template cl_int enqueueBlitSplit(MultiDispatchInfo &dispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr); template CompletionStamp enqueueNonBlocked(Surface **surfacesForResidency, size_t surfaceCount, LinearStream &commandStream, size_t commandStreamStart, bool &blocking, bool clearDependenciesForSubCapture, const MultiDispatchInfo &multiDispatchInfo, const EnqueueProperties &enqueueProperties, TimestampPacketDependencies ×tampPacketDependencies, EventsRequest &eventsRequest, EventBuilder &eventBuilder, TaskCountType taskLevel, PrintfHandler *printfHandler); void enqueueBlocked(uint32_t commandType, Surface **surfacesForResidency, size_t surfacesCount, const MultiDispatchInfo &multiDispatchInfo, TimestampPacketDependencies ×tampPacketDependencies, std::unique_ptr &blockedCommandsData, const EnqueueProperties &enqueueProperties, EventsRequest &eventsRequest, EventBuilder &externalEventBuilder, std::unique_ptr &&printfHandler, CommandStreamReceiver *bcsCsr); CompletionStamp enqueueCommandWithoutKernel(Surface **surfaces, size_t surfaceCount, LinearStream *commandStream, size_t commandStreamStart, bool &blocking, const EnqueueProperties &enqueueProperties, TimestampPacketDependencies ×tampPacketDependencies, EventsRequest &eventsRequest, EventBuilder &eventBuilder, TaskCountType taskLevel, CsrDependencies &csrDeps, CommandStreamReceiver *bcsCsr); void processDispatchForCacheFlush(Surface **surfaces, size_t numSurfaces, LinearStream *commandStream, CsrDependencies &csrDeps); void processDispatchForMarker(CommandQueue &commandQueue, LinearStream *commandStream, EventsRequest &eventsRequest, CsrDependencies &csrDeps); void processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue, LinearStream *commandStream, EventsRequest &eventsRequest, CsrDependencies &csrDeps); BlitProperties processDispatchForBlitEnqueue(CommandStreamReceiver &blitCommandStreamReceiver, const MultiDispatchInfo &multiDispatchInfo, TimestampPacketDependencies ×tampPacketDependencies, const EventsRequest &eventsRequest, LinearStream *commandStream, uint32_t commandType, bool queueBlocked); void submitCacheFlush(Surface **surfaces, size_t numSurfaces, LinearStream *commandStream, uint64_t postSyncAddress); bool isCacheFlushCommand(uint32_t commandType) const override; bool waitForTimestamps(Range copyEnginesToWait, TaskCountType taskCount, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override; MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const; protected: MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){}; size_t calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image); cl_int enqueueReadWriteBufferOnCpuWithMemoryTransfer(cl_command_type commandType, Buffer *buffer, size_t offset, size_t size, void *ptr, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); cl_int enqueueReadWriteBufferOnCpuWithoutMemoryTransfer(cl_command_type commandType, Buffer *buffer, size_t offset, size_t size, void *ptr, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); cl_int enqueueMarkerForReadWriteOperation(MemObj *memObj, void *ptr, cl_command_type commandType, cl_bool blocking, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); MOCKABLE_VIRTUAL void dispatchAuxTranslationBuiltin(MultiDispatchInfo &multiDispatchInfo, AuxTranslationDirection auxTranslationDirection); void setupBlitAuxTranslation(MultiDispatchInfo &multiDispatchInfo); MOCKABLE_VIRTUAL bool forceStateless(size_t size); template LinearStream *obtainCommandStream(const CsrDependencies &csrDependencies, bool blitEnqueue, bool blockedQueue, const MultiDispatchInfo &multiDispatchInfo, const EventsRequest &eventsRequest, std::unique_ptr &blockedCommandsData, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling) { LinearStream *commandStream = nullptr; bool profilingRequired = (this->isProfilingEnabled() && eventsRequest.outEvent); bool perfCountersRequired = (this->isPerfCountersEnabled() && eventsRequest.outEvent); if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue, isMarkerWithProfiling)) { constexpr size_t additionalAllocationSize = CSRequirements::csOverfetchSize; constexpr size_t allocationSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize; commandStream = new LinearStream(); auto &gpgpuCsr = getGpgpuCommandStreamReceiver(); gpgpuCsr.ensureCommandBufferAllocation(*commandStream, allocationSize, additionalAllocationSize); blockedCommandsData = std::make_unique(commandStream, *gpgpuCsr.getInternalAllocationStorage()); } else { commandStream = &getCommandStream(*this, csrDependencies, profilingRequired, perfCountersRequired, blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0); } return commandStream; } void processDispatchForBlitAuxTranslation(CommandStreamReceiver &bcsCsr, const MultiDispatchInfo &multiDispatchInfo, BlitPropertiesContainer &blitPropertiesContainer, TimestampPacketDependencies ×tampPacketDependencies, const EventsRequest &eventsRequest, bool queueBlocked); bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const override; bool isTaskLevelUpdateRequired(const TaskCountType &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType); void obtainTaskLevelAndBlockedStatus(TaskCountType &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType) override; static void computeOffsetsValueForRectCommands(size_t *bufferOffset, size_t *hostOffset, const size_t *bufferOrigin, const size_t *hostOrigin, const size_t *region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch); template void processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo, std::unique_ptr &printfHandler, Event *event, TagNodeBase *&hwTimeStamps, bool blockQueue, CsrDependencies &csrDeps, KernelOperation *blockedCommandsData, TimestampPacketDependencies ×tampPacketDependencies); MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies ×tampPacketDependencies) const; void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType); bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo); }; } // namespace NEO