/*
 * Copyright (C) 2021-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/cache_flush_xehp_and_later.inl"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/l3_range.h"
#include "shared/source/helpers/simd_helper.h"

#include "opencl/source/command_queue/gpgpu_walker_base.inl"
#include "opencl/source/platform/platform.h"

namespace NEO {

template <typename GfxFamily>
size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
    WALKER_TYPE *walkerCmd,
    const KernelDescriptor &kernelDescriptor,
    const size_t globalOffsets[3],
    const size_t startWorkGroups[3],
    const size_t numWorkGroups[3],
    const size_t localWorkSizesIn[3],
    uint32_t simd,
    uint32_t workDim,
    bool localIdsGenerationByRuntime,
    bool inlineDataProgrammingRequired,
    uint32_t requiredWorkGroupOrder) {
    bool kernelUsesLocalIds = kernelDescriptor.kernelAttributes.numLocalIdChannels > 0;
    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];

    walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
    walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
    walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));

    // Compute the execution mask, which tells the hardware which SIMD lanes are active within a thread.
    auto remainderSimdLanes = localWorkSize & (simd - 1);
    uint64_t executionMask = maxNBitValue(remainderSimdLanes);
    if (!executionMask) {
        executionMask = maxNBitValue((simd == 1) ? 32 : simd);
    }

    walkerCmd->setExecutionMask(static_cast<uint32_t>(executionMask));
    walkerCmd->setSimdSize(getSimdConfig<WALKER_TYPE>(simd));
    walkerCmd->setMessageSimd(walkerCmd->getSimdSize());

    if (DebugManager.flags.ForceSimdMessageSizeInWalker.get() != -1) {
        walkerCmd->setMessageSimd(DebugManager.flags.ForceSimdMessageSizeInWalker.get());
    }

    walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
    walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
    walkerCmd->setThreadGroupIdStartingZ(static_cast<uint32_t>(startWorkGroups[2]));

    // 1) Cross-thread inline data is placed in R1, but if the kernel uses local ids, the cross-thread
    //    data must be placed further back. So whenever local ids are driver- or hardware-generated,
    //    reserve space for them by programming emitLocalIds accordingly.
    // 2) Hardware generation of local ids is only enabled when the kernel actually uses local ids.
    if (!localIdsGenerationByRuntime && kernelUsesLocalIds) {
        uint32_t emitLocalIdsForDim = 0;
        if (kernelDescriptor.kernelAttributes.localId[0]) {
            emitLocalIdsForDim |= (1 << 0);
        }
        if (kernelDescriptor.kernelAttributes.localId[1]) {
            emitLocalIdsForDim |= (1 << 1);
        }
        if (kernelDescriptor.kernelAttributes.localId[2]) {
            emitLocalIdsForDim |= (1 << 2);
        }
        walkerCmd->setEmitLocalId(emitLocalIdsForDim);
    }
    if (inlineDataProgrammingRequired) {
        walkerCmd->setEmitInlineParameter(1);
    }

    if (!localIdsGenerationByRuntime && kernelUsesLocalIds) {
        walkerCmd->setLocalXMaximum(static_cast<uint32_t>(localWorkSizesIn[0] - 1));
        walkerCmd->setLocalYMaximum(static_cast<uint32_t>(localWorkSizesIn[1] - 1));
        walkerCmd->setLocalZMaximum(static_cast<uint32_t>(localWorkSizesIn[2] - 1));

        walkerCmd->setGenerateLocalId(1);
        walkerCmd->setWalkOrder(requiredWorkGroupOrder);
    }

    return localWorkSize;
}
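
// Worked example of the execution-mask computation above (illustrative values,
// not taken from the driver): with localWorkSize = 100 and simd = 32, the last
// thread of each work group covers only 100 & 31 = 4 work items, so
// executionMask = maxNBitValue(4) = 0b1111 leaves just the first four lanes of
// that trailing thread active. When localWorkSize is a multiple of simd, the
// remainder is 0 and a full mask is used instead, e.g. maxNBitValue(32) =
// 0xFFFFFFFF; SIMD1 kernels also get the full 32-bit mask. Note that
// `localWorkSize & (simd - 1)` only equals `localWorkSize % simd` because
// every supported SIMD size is a power of two.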

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(LinearStream *cmdStream, WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment) {
    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;

    const auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
    auto &postSyncData = walkerCmd->getPostSync();
    postSyncData.setDataportPipelineFlush(true);

    EncodeDispatchKernel<GfxFamily>::setupPostSyncMocs(*walkerCmd, rootDeviceEnvironment, MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, rootDeviceEnvironment));
    EncodeDispatchKernel<GfxFamily>::adjustTimestampPacket(*walkerCmd, hwInfo);

    if (DebugManager.flags.UseImmDataWriteModeOnPostSyncOperation.get()) {
        postSyncData.setOperation(GfxFamily::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_IMMEDIATE_DATA);
        auto contextEndAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNode);
        postSyncData.setDestinationAddress(contextEndAddress);
        postSyncData.setImmediateData(0x2'0000'0002);
    } else {
        postSyncData.setOperation(GfxFamily::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
        auto contextStartAddress = TimestampPacketHelper::getContextStartGpuAddress(*timestampPacketNode);
        postSyncData.setDestinationAddress(contextStartAddress);
    }

    if (DebugManager.flags.OverrideSystolicInComputeWalker.get() != -1) {
        walkerCmd->setSystolicModeEnable(DebugManager.flags.OverrideSystolicInComputeWalker.get());
    }
}
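
// Note on the two post-sync modes above: in the default OPERATION_WRITE_TIMESTAMP
// mode, the walker writes the hardware timestamp group starting at the packet's
// context-start address, so profiling data is captured. With the
// UseImmDataWriteModeOnPostSyncOperation debug flag set, the walker instead
// writes the 64-bit immediate 0x2'0000'0002 at the context-end address; this
// appears to stamp the two adjacent 32-bit completion fields with a value
// different from the packet's initial one, so waiters still observe completion
// even though no real timestamps are produced (an interpretation based on the
// timestamp-packet layout, not stated in this file).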

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM *storeCmd) {
    storeCmd->setMmioRemapEnable(true);
}

template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo) {
    size_t numBarriers = MemorySynchronizationCommands<GfxFamily>::isBarrierWaRequired(commandQueue.getDevice().getRootDeviceEnvironment()) ? 2 : 1;
    size_t size = sizeof(typename GfxFamily::COMPUTE_WALKER) +
                  (MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false) * numBarriers) +
                  HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS() +
                  EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(pKernel->getKernelInfo().heapInfo.kernelHeapSize, commandQueue.getDevice().getRootDeviceEnvironment());
    auto devices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getDeviceBitfield();
    auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
    if (partitionWalker) {
        Vec3<size_t> groupStart = dispatchInfo.getStartOfWorkgroups();
        Vec3<size_t> groupCount = dispatchInfo.getNumberOfWorkgroups();
        UNRECOVERABLE_IF(groupCount.x == 0);
        const bool staticPartitioning = commandQueue.getGpgpuCommandStreamReceiver().isStaticWorkPartitioningEnabled();
        size += static_cast<size_t>(ImplicitScalingDispatch<GfxFamily>::getSize(false, staticPartitioning, devices, groupStart, groupCount));
    }

    size += PerformanceCounters::getGpuCommandsSize(commandQueue.getPerfCounters(), commandQueue.getGpgpuEngine().osContext->getEngineType(), reservePerfCounters);

    return size;
}

template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite() {
    return 0;
}

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(TagNodeBase &hwTimeStamps, LinearStream *commandStream, const RootDeviceEnvironment &rootDeviceEnvironment) {
    // Intentionally empty: on this platform, profiling timestamps are captured
    // through the COMPUTE_WALKER post-sync operation (see setupTimestampPacket)
    // rather than through separate profiling commands.
}

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(TagNodeBase &hwTimeStamps, LinearStream *commandStream, const RootDeviceEnvironment &rootDeviceEnvironment) {
    // Intentionally empty, see dispatchProfilingCommandsStart above.
}

template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeForCacheFlushAfterWalkerCommands(const Kernel &kernel, const CommandQueue &commandQueue) {
    size_t size = 0;
    if (kernel.requiresCacheFlushCommand(commandQueue)) {
        size += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false);

        if constexpr (GfxFamily::isUsingL3Control) {
            StackVec<GraphicsAllocation *, 32> allocationsForCacheFlush;
            kernel.getAllocationsForCacheFlush(allocationsForCacheFlush);

            StackVec<L3Range, 128> subranges;
            for (auto &allocation : allocationsForCacheFlush) {
                coverRangeExact(allocation->getGpuAddress(), allocation->getUnderlyingBufferSize(), subranges, GfxFamily::L3_FLUSH_ADDRESS_RANGE::L3_FLUSH_EVICTION_POLICY_FLUSH_L3_WITH_EVICTION);
            }
            size += getSizeNeededToFlushGpuCache<GfxFamily>(subranges, true);
        }
    }
    return size;
}

} // namespace NEO
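
// A minimal sketch of how getSizeForCacheFlushAfterWalkerCommands sizes the L3
// flush (the buffer address, size, and StackVec capacity below are illustrative
// assumptions, not values taken from this file):
//
//   StackVec<L3Range, 128> subranges;
//   // Split a 128 KB buffer at GPU VA 0x10000 into power-of-two-aligned
//   // L3 flush ranges, each tagged with the eviction policy:
//   coverRangeExact(0x10000, 128 * MemoryConstants::kiloByte, subranges,
//                   GfxFamily::L3_FLUSH_ADDRESS_RANGE::L3_FLUSH_EVICTION_POLICY_FLUSH_L3_WITH_EVICTION);
//   // The size helper then accounts for the command(s) needed to carry those
//   // range entries, plus a post-sync write (second argument = true):
//   size_t bytes = getSizeNeededToFlushGpuCache<GfxFamily>(subranges, true);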