compute-runtime/opencl/source/command_queue/hardware_interface_xehp_and...

/*
 * Copyright (C) 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/os_interface.h"
#include "shared/source/utilities/tag_allocator.h"

#include "opencl/source/command_queue/hardware_interface_base.inl"

namespace NEO {

// No-op in this implementation: the body is empty, so none of the arguments
// (descriptor-table offset/size, command queue, dispatch info, DSH heap,
// command stream) are read or modified here.
// NOTE(review): presumably kept so that this file still provides a definition
// for the member declared on HardwareInterface - confirm against the other
// hardware_interface_*.inl variants.
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
    const size_t &offsetInterfaceDescriptorTable,
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    size_t &totalInterfaceDescriptorTableSize,
    IndirectHeap *dsh,
    LinearStream *commandStream) {
}

// No-op in this implementation: the body is empty, so nothing is written to
// commandStream and commandQueue, kernel and enable are ignored.
// NOTE(review): presumably this hardware family needs no walker workarounds -
// confirm against the other hardware_interface_*.inl variants.
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
    LinearStream *commandStream,
    CommandQueue &commandQueue,
    Kernel &kernel,
    const bool &enable) {
}

// Builds a COMPUTE_WALKER command for a single kernel dispatch and emits it
// into commandStream, either directly or through ImplicitScalingDispatch when
// implicit scaling is enabled for the queue's device bitfield.
//
// Parameters:
//   commandStream                  - receives the memory prefetch, the indirect state
//                                    commands and the walker itself.
//   kernel                         - kernel being dispatched; source of kernel info,
//                                    start address, execution type and image usage.
//   commandQueue                   - supplies the GPGPU CSR/engine, the device and its
//                                    hardware info.
//   currentTimestampPacketNodes    - optional (may be nullptr) container; the node at
//                                    currentDispatchIndex is used for timestamp setup
//                                    and for recording the used packet count.
//   dsh / ioh / ssh                - heaps forwarded to sendIndirectState.
//   globalWorkSizes                - not used by this implementation.
//   localWorkSizes                 - forwarded to local-id generation check, indirect
//                                    state programming and walker thread data.
//   preemptionMode                 - forwarded to sendIndirectState.
//   currentDispatchIndex           - index of the timestamp packet node for this dispatch.
//   interfaceDescriptorIndex       - forwarded (by reference) to sendIndirectState.
//   dispatchInfo                   - supplies the work dimension and the global offset.
//   offsetInterfaceDescriptorTable - forwarded to sendIndirectState.
//   numberOfWorkgroups             - work-group counts per dimension.
//   startOfWorkgroups              - starting work-group per dimension.
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::programWalker(
    LinearStream &commandStream,
    Kernel &kernel,
    CommandQueue &commandQueue,
    TimestampPacketContainer *currentTimestampPacketNodes,
    IndirectHeap &dsh,
    IndirectHeap &ioh,
    IndirectHeap &ssh,
    size_t globalWorkSizes[3],
    size_t localWorkSizes[3],
    PreemptionMode preemptionMode,
    size_t currentDispatchIndex,
    uint32_t &interfaceDescriptorIndex,
    const DispatchInfo &dispatchInfo,
    size_t offsetInterfaceDescriptorTable,
    const Vec3<size_t> &numberOfWorkgroups,
    const Vec3<size_t> &startOfWorkgroups) {

    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;

    // The walker is assembled in a local copy and only written to the stream at the end.
    COMPUTE_WALKER walkerCmd = GfxFamily::cmdInitGpgpuWalker;
    auto &kernelInfo = kernel.getKernelInfo();

    uint32_t dim = dispatchInfo.getDim();
    uint32_t simd = kernelInfo.getMaxSimdSize();

    auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;

    // Helpers below take plain 3-element arrays, so unpack the Vec3 inputs.
    size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
    size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
    size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};
    uint32_t requiredWalkOrder = 0u;

    // requiredWalkOrder is passed in here and later consumed by setGpgpuWalkerThreadData.
    bool localIdsGenerationByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
        numChannels,
        localWorkSizes,
        std::array<uint8_t, 3>{{kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
                                kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
                                kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
        kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,
        requiredWalkOrder,
        simd);

    bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
    auto idd = &walkerCmd.getInterfaceDescriptor();
    auto &queueCsr = commandQueue.getGpgpuCommandStreamReceiver();

    // Timestamp setup happens only when a container was supplied and the CSR has
    // timestamp packet writes enabled.
    if (currentTimestampPacketNodes && queueCsr.peekTimestampPacketWriteEnabled()) {
        auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
        GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacket, commandQueue.getDevice().getRootDeviceEnvironment());
    }

    auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
    auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);

    const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
    // Prefetch is programmed only when the kernel has a backing graphics allocation.
    if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {
        EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.KernelHeapSize, 0, hwInfo);
    }

    HardwareCommandsHelper<GfxFamily>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        kernel,
        kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, false),
        simd,
        localWorkSizes,
        offsetInterfaceDescriptorTable,
        interfaceDescriptorIndex,
        preemptionMode,
        &walkerCmd,
        idd,
        localIdsGenerationByRuntime,
        commandQueue.getDevice());

    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,
                                                           numWorkGroups, localWorkSizes, simd, dim,
                                                           localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);

    EncodeWalkerArgs walkerArgs{kernel.getExecutionType(), true};
    EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, walkerArgs);

    auto devices = queueCsr.getOsContext().getDeviceBitfield();
    auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, !kernel.isSingleSubdevicePreferred());

    if (partitionWalker) {
        const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
        // partitionCount is handed to dispatchCommands and read back afterwards.
        uint32_t partitionCount = 0u;
        ImplicitScalingDispatch<GfxFamily>::dispatchCommands(commandStream,
                                                             walkerCmd,
                                                             devices,
                                                             partitionCount,
                                                             false,
                                                             false,
                                                             kernel.usesImages(),
                                                             workPartitionAllocationGpuVa,
                                                             hwInfo);
        if (queueCsr.isStaticWorkPartitioningEnabled()) {
            // Active partitions only ever grow: keep the larger of current and new count.
            queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
        }
        // The container is optional (see the null check above), so it must not be
        // dereferenced unconditionally when recording the number of packets used.
        if (currentTimestampPacketNodes) {
            auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
            timestampPacket->setPacketsUsed(partitionCount);
        }
    } else {
        // No partitioning: copy the assembled walker straight into the stream.
        auto computeWalkerOnStream = commandStream.getSpaceForCmd<COMPUTE_WALKER>();
        *computeWalkerOnStream = walkerCmd;
    }
}
} // namespace NEO
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00			`/*`
Remove device enqueue part 6 - isParentKernel, peekParentKernel, parentKernel - structs: AUBParentKernelFixture, MockParentKernel, ParentKernelCommandQueueFixture Related-To: NEO-6559 Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com> 2022-01-13 23:27:58 +08:00			`* Copyright (C) 2021-2022 Intel Corporation`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00			`*`
			`* SPDX-License-Identifier: MIT`
			`*`
			`*/`

			`#pragma once`
			`#include "shared/source/command_container/command_encoder.h"`
			`#include "shared/source/command_container/implicit_scaling.h"`
			`#include "shared/source/debug_settings/debug_settings_manager.h"`
			`#include "shared/source/helpers/engine_node_helper.h"`
			`#include "shared/source/os_interface/os_context.h"`
			`#include "shared/source/os_interface/os_interface.h"`
			`#include "shared/source/utilities/tag_allocator.h"`

			`#include "opencl/source/command_queue/hardware_interface_base.inl"`

			`namespace NEO {`

			`template <typename GfxFamily>`
			`inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(`
			`const size_t &offsetInterfaceDescriptorTable,`
			`CommandQueue &commandQueue,`
			`const MultiDispatchInfo &multiDispatchInfo,`
			`size_t &totalInterfaceDescriptorTableSize,`
			`IndirectHeap *dsh,`
			`LinearStream *commandStream) {`
			`}`

			`template <typename GfxFamily>`
			`inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(`
			`LinearStream *commandStream,`
			`CommandQueue &commandQueue,`
			`Kernel &kernel,`
			`const bool &enable) {`
			`}`

			`template <typename GfxFamily>`
			`inline void HardwareInterface<GfxFamily>::programWalker(`
			`LinearStream &commandStream,`
			`Kernel &kernel,`
			`CommandQueue &commandQueue,`
			`TimestampPacketContainer *currentTimestampPacketNodes,`
			`IndirectHeap &dsh,`
			`IndirectHeap &ioh,`
			`IndirectHeap &ssh,`
			`size_t globalWorkSizes[3],`
			`size_t localWorkSizes[3],`
			`PreemptionMode preemptionMode,`
			`size_t currentDispatchIndex,`
			`uint32_t &interfaceDescriptorIndex,`
			`const DispatchInfo &dispatchInfo,`
			`size_t offsetInterfaceDescriptorTable,`
Code cleanup - avoid copy 5/n Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com> 2021-09-08 05:21:19 +08:00			`const Vec3<size_t> &numberOfWorkgroups,`
			`const Vec3<size_t> &startOfWorkgroups) {`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00
			`using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;`

			`COMPUTE_WALKER walkerCmd = GfxFamily::cmdInitGpgpuWalker;`
			`auto &kernelInfo = kernel.getKernelInfo();`

			`uint32_t dim = dispatchInfo.getDim();`
			`uint32_t simd = kernelInfo.getMaxSimdSize();`

			`auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;`

			`size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};`
			`size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};`
			`size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};`
			`uint32_t requiredWalkOrder = 0u;`

			`bool localIdsGenerationByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(`
			`numChannels,`
			`localWorkSizes,`
			`std::array<uint8_t, 3>{{kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],`
			`kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],`
			`kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},`
			`kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,`
			`requiredWalkOrder,`
			`simd);`

			`bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);`
			`auto idd = &walkerCmd.getInterfaceDescriptor();`
Enable multi-tile task count post-sync writes Related-To: NEO-6244 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com> 2021-09-24 02:13:37 +08:00			`auto &queueCsr = commandQueue.getGpgpuCommandStreamReceiver();`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00
Enable multi-tile task count post-sync writes Related-To: NEO-6244 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com> 2021-09-24 02:13:37 +08:00			`if (currentTimestampPacketNodes && queueCsr.peekTimestampPacketWriteEnabled()) {`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00			`auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);`
			`GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacket, commandQueue.getDevice().getRootDeviceEnvironment());`
			`}`

			`auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());`
			`auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);`

Add isDcFlushAllowed function to HwInfoConfig Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2021-12-20 22:37:33 +08:00			`const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00			`if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {`
Add isDcFlushAllowed function to HwInfoConfig Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2021-12-20 22:37:33 +08:00			`EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.KernelHeapSize, 0, hwInfo);`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00			`}`

			`HardwareCommandsHelper<GfxFamily>::sendIndirectState(`
			`commandStream,`
			`dsh,`
			`ioh,`
			`ssh,`
			`kernel,`
Add LogicalStateHelper getter for CommandQueue. Refactor Kernel handling Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2022-06-28 01:20:50 +08:00			`kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, false),`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00			`simd,`
			`localWorkSizes,`
			`offsetInterfaceDescriptorTable,`
			`interfaceDescriptorIndex,`
			`preemptionMode,`
			`&walkerCmd,`
			`idd,`
			`localIdsGenerationByRuntime,`
			`commandQueue.getDevice());`

			`GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,`
			`numWorkGroups, localWorkSizes, simd, dim,`
			`localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);`

Change interface to method programing additional fields of command Related-To: NEO-6959 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com> 2022-05-26 21:20:02 +08:00			`EncodeWalkerArgs walkerArgs{kernel.getExecutionType(), true};`
			`EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, walkerArgs);`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00
Enable multi-tile task count post-sync writes Related-To: NEO-6244 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com> 2021-09-24 02:13:37 +08:00			`auto devices = queueCsr.getOsContext().getDeviceBitfield();`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00			`auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, !kernel.isSingleSubdevicePreferred());`

			`if (partitionWalker) {`
			`const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();`
			`uint32_t partitionCount = 0u;`
			`ImplicitScalingDispatch<GfxFamily>::dispatchCommands(commandStream,`
			`walkerCmd,`
			`devices,`
			`partitionCount,`
			`false,`
			`false,`
			`kernel.usesImages(),`
Add isDcFlushAllowed function to HwInfoConfig Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2021-12-20 22:37:33 +08:00			`workPartitionAllocationGpuVa,`
			`hwInfo);`
Pass active partitions from dispatched kernel to context Related-To: NEO-6244 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com> 2021-10-05 00:37:12 +08:00			`if (queueCsr.isStaticWorkPartitioningEnabled()) {`
			`queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));`
			`}`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00			`auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);`
			`timestampPacket->setPacketsUsed(partitionCount);`
			`} else {`
Remove RMW from gfx allocations Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com> 2021-07-09 20:14:05 +08:00			`auto computeWalkerOnStream = commandStream.getSpaceForCmd<typename GfxFamily::COMPUTE_WALKER>();`
Partial support for XE_HP_SDV Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com> 2021-04-24 00:43:48 +08:00			`*computeWalkerOnStream = walkerCmd;`
			`}`
			`}`
			`} // namespace NEO`