compute-runtime/opencl/source/command_queue/gpgpu_walker_xehp_and_later...

/*
 * Copyright (C) 2021-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/cache_flush_xehp_and_later.inl"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/l3_range.h"
#include "shared/source/helpers/simd_helper.h"

#include "opencl/source/command_queue/gpgpu_walker_base.inl"
#include "opencl/source/platform/platform.h"

namespace NEO {

template <typename GfxFamily>
size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
    WALKER_TYPE *walkerCmd,
    const KernelDescriptor &kernelDescriptor,
    const size_t globalOffsets[3],
    const size_t startWorkGroups[3],
    const size_t numWorkGroups[3],
    const size_t localWorkSizesIn[3],
    uint32_t simd,
    uint32_t workDim,
    bool localIdsGenerationByRuntime,
    bool inlineDataProgrammingRequired,
    uint32_t requiredWorkGroupOrder) {

    bool kernelUsesLocalIds = kernelDescriptor.kernelAttributes.numLocalIdChannels > 0;

    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];

    walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
    walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
    walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));

    // compute executionMask - to tell which SIMD lines are active within thread
    auto remainderSimdLanes = localWorkSize & (simd - 1);
    uint64_t executionMask = maxNBitValue(remainderSimdLanes);
    if (!executionMask) {
        executionMask = maxNBitValue((simd == 1) ? 32 : simd);
    }

    walkerCmd->setExecutionMask(static_cast<uint32_t>(executionMask));
    walkerCmd->setSimdSize(getSimdConfig<WALKER_TYPE>(simd));
    walkerCmd->setMessageSimd(walkerCmd->getSimdSize());

    if (DebugManager.flags.ForceSimdMessageSizeInWalker.get() != -1) {
        walkerCmd->setMessageSimd(DebugManager.flags.ForceSimdMessageSizeInWalker.get());
    }

    walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
    walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
    walkerCmd->setThreadGroupIdStartingZ(static_cast<uint32_t>(startWorkGroups[2]));

    // 1) cross-thread inline data will be put into R1, but if kernel uses local ids, then cross-thread should be put further back
    // so whenever local ids are driver or hw generated, reserve space by setting right values for emitLocalIds
    // 2) Auto-generation of local ids should be possible, when in fact local ids are used
    if (!localIdsGenerationByRuntime && kernelUsesLocalIds) {
        uint32_t emitLocalIdsForDim = 0;
        if (kernelDescriptor.kernelAttributes.localId[0]) {
            emitLocalIdsForDim |= (1 << 0);
        }
        if (kernelDescriptor.kernelAttributes.localId[1]) {
            emitLocalIdsForDim |= (1 << 1);
        }
        if (kernelDescriptor.kernelAttributes.localId[2]) {
            emitLocalIdsForDim |= (1 << 2);
        }
        walkerCmd->setEmitLocalId(emitLocalIdsForDim);
    }
    if (inlineDataProgrammingRequired == true) {
        walkerCmd->setEmitInlineParameter(1);
    }

    if ((!localIdsGenerationByRuntime) && kernelUsesLocalIds) {
        walkerCmd->setLocalXMaximum(static_cast<uint32_t>(localWorkSizesIn[0] - 1));
        walkerCmd->setLocalYMaximum(static_cast<uint32_t>(localWorkSizesIn[1] - 1));
        walkerCmd->setLocalZMaximum(static_cast<uint32_t>(localWorkSizesIn[2] - 1));

        walkerCmd->setGenerateLocalId(1);
        walkerCmd->setWalkOrder(requiredWorkGroupOrder);
    }

    return localWorkSize;
}

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(LinearStream *cmdStream,
                                                        WALKER_TYPE *walkerCmd,
                                                        TagNodeBase *timestampPacketNode,
                                                        const RootDeviceEnvironment &rootDeviceEnvironment) {
    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;

    const auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
    auto &postSyncData = walkerCmd->getPostSync();
    postSyncData.setDataportPipelineFlush(true);

    EncodeDispatchKernel<GfxFamily>::setupPostSyncMocs(*walkerCmd, rootDeviceEnvironment,
                                                       MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, rootDeviceEnvironment));

    EncodeDispatchKernel<GfxFamily>::adjustTimestampPacket(*walkerCmd, hwInfo);

    if (DebugManager.flags.UseImmDataWriteModeOnPostSyncOperation.get()) {
        postSyncData.setOperation(GfxFamily::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_IMMEDIATE_DATA);
        auto contextEndAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNode);
        postSyncData.setDestinationAddress(contextEndAddress);
        postSyncData.setImmediateData(0x2'0000'0002);
    } else {
        postSyncData.setOperation(GfxFamily::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
        auto contextStartAddress = TimestampPacketHelper::getContextStartGpuAddress(*timestampPacketNode);
        postSyncData.setDestinationAddress(contextStartAddress);
    }
    if (DebugManager.flags.OverrideSystolicInComputeWalker.get() != -1) {
        walkerCmd->setSystolicModeEnable((DebugManager.flags.OverrideSystolicInComputeWalker.get()));
    }
}

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxFamily> *storeCmd) {
    storeCmd->setMmioRemapEnable(true);
}

template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo) {
    size_t numBarriers = MemorySynchronizationCommands<GfxFamily>::isBarrierWaRequired(commandQueue.getDevice().getHardwareInfo()) ? 2 : 1;

    size_t size = sizeof(typename GfxFamily::COMPUTE_WALKER) +
                  (MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false) * numBarriers) +
                  HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS() +
                  EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(pKernel->getKernelInfo().heapInfo.KernelHeapSize, commandQueue.getDevice().getRootDeviceEnvironment());
    auto devices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getDeviceBitfield();
    auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
    if (partitionWalker) {
        Vec3<size_t> groupStart = dispatchInfo.getStartOfWorkgroups();
        Vec3<size_t> groupCount = dispatchInfo.getNumberOfWorkgroups();
        UNRECOVERABLE_IF(groupCount.x == 0);
        const bool staticPartitioning = commandQueue.getGpgpuCommandStreamReceiver().isStaticWorkPartitioningEnabled();
        size += static_cast<size_t>(ImplicitScalingDispatch<GfxFamily>::getSize(false, staticPartitioning, devices, groupStart, groupCount));
    }

    size += PerformanceCounters::getGpuCommandsSize(commandQueue.getPerfCounters(), commandQueue.getGpgpuEngine().osContext->getEngineType(), reservePerfCounters);

    return size;
}

template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite() {
    return 0;
}

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(TagNodeBase &hwTimeStamps, LinearStream *commandStream, const RootDeviceEnvironment &rootDeviceEnvironment) {
}

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(TagNodeBase &hwTimeStamps, LinearStream *commandStream, const RootDeviceEnvironment &rootDeviceEnvironment) {
}

template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeForCacheFlushAfterWalkerCommands(const Kernel &kernel, const CommandQueue &commandQueue) {
    size_t size = 0;

    if (kernel.requiresCacheFlushCommand(commandQueue)) {
        size += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false);

        if constexpr (GfxFamily::isUsingL3Control) {
            StackVec<GraphicsAllocation *, 32> allocationsForCacheFlush;
            kernel.getAllocationsForCacheFlush(allocationsForCacheFlush);

            StackVec<L3Range, 128> subranges;
            for (auto &allocation : allocationsForCacheFlush) {
                coverRangeExact(allocation->getGpuAddress(), allocation->getUnderlyingBufferSize(), subranges, GfxFamily::L3_FLUSH_ADDRESS_RANGE::L3_FLUSH_EVICTION_POLICY_FLUSH_L3_WITH_EVICTION);
            }

            size += getSizeNeededToFlushGpuCache<GfxFamily>(subranges, true);
        }
    }

    return size;
}

} // namespace NEO