diff --git a/shared/source/command_container/CMakeLists.txt b/shared/source/command_container/CMakeLists.txt index 2ae2fb4dfd..c69b9302c6 100644 --- a/shared/source/command_container/CMakeLists.txt +++ b/shared/source/command_container/CMakeLists.txt @@ -10,7 +10,6 @@ set(NEO_CORE_COMMAND_CONTAINER ${CMAKE_CURRENT_SOURCE_DIR}/cmdcontainer.h ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder.h ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder.inl - ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_bdw_and_later.inl ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_enablers.inl ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_tgllp_and_later.inl ${CMAKE_CURRENT_SOURCE_DIR}/encode_alu_helper.h diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl deleted file mode 100644 index e3930d6e29..0000000000 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ /dev/null @@ -1,692 +0,0 @@ -/* - * Copyright (C) 2020-2025 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#pragma once -#include "shared/source/command_container/command_encoder.h" -#include "shared/source/command_container/encode_surface_state.h" -#include "shared/source/command_stream/linear_stream.h" -#include "shared/source/command_stream/memory_compression_state.h" -#include "shared/source/command_stream/preemption.h" -#include "shared/source/execution_environment/execution_environment.h" -#include "shared/source/gmm_helper/gmm_helper.h" -#include "shared/source/helpers/api_specific_config.h" -#include "shared/source/helpers/cache_policy.h" -#include "shared/source/helpers/gfx_core_helper.h" -#include "shared/source/helpers/in_order_cmd_helpers.h" -#include "shared/source/helpers/pause_on_gpu_properties.h" -#include "shared/source/helpers/pipe_control_args.h" -#include "shared/source/helpers/pipeline_select_args.h" -#include "shared/source/helpers/simd_helper.h" -#include "shared/source/helpers/state_base_address.h" -#include "shared/source/kernel/dispatch_kernel_encoder_interface.h" -#include "shared/source/kernel/implicit_args_helper.h" - -#include - -namespace NEO { - -template -template -void EncodeDispatchKernel::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount, - const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, - const RootDeviceEnvironment &rootDeviceEnvironment) { - auto grfSize = sizeof(typename Family::GRF); - DEBUG_BREAK_IF((sizeCrossThreadData % grfSize) != 0); - auto numGrfCrossThreadData = static_cast(sizeCrossThreadData / grfSize); - DEBUG_BREAK_IF(numGrfCrossThreadData == 0); - pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData); - - DEBUG_BREAK_IF((sizePerThreadData % grfSize) != 0); - auto numGrfPerThreadData = static_cast(sizePerThreadData / grfSize); - - // at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0 - numGrfPerThreadData = std::max(numGrfPerThreadData, 1u); - pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData); -} - -template -template -void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDispatchKernelArgs &args) { - - using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH; - using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS; - - auto &kernelDescriptor = args.dispatchInterface->getKernelDescriptor(); - auto sizeCrossThreadData = args.dispatchInterface->getCrossThreadDataSize(); - auto sizePerThreadData = args.dispatchInterface->getPerThreadDataSize(); - auto sizePerThreadDataForWholeGroup = args.dispatchInterface->getPerThreadDataSizeForWholeThreadGroup(); - auto pImplicitArgs = args.dispatchInterface->getImplicitArgs(); - - auto &hwInfo = args.device->getHardwareInfo(); - auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment(); - - LinearStream *listCmdBufferStream = container.getCommandStream(); - - auto threadGroupDims = static_cast(args.threadGroupDimensions); - - DefaultWalkerType cmd = Family::cmdInitGpgpuWalker; - auto idd = Family::cmdInitInterfaceDescriptorData; - { - auto alloc = args.dispatchInterface->getIsaAllocation(); - UNRECOVERABLE_IF(nullptr == alloc); - auto offset = alloc->getGpuAddressToPatch() + args.dispatchInterface->getIsaOffsetInParentAllocation(); - idd.setKernelStartPointer(offset); - idd.setKernelStartPointerHigh(0u); - } - - if (args.dispatchInterface->getKernelDescriptor().kernelAttributes.flags.usesAssert && args.device->getL0Debugger() != nullptr) { - idd.setSoftwareExceptionEnable(1); - } - - auto numThreadsPerThreadGroup = args.dispatchInterface->getNumThreadsPerThreadGroup(); - idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup); - - EncodeDispatchKernel::programBarrierEnable(idd, - kernelDescriptor, - hwInfo); - auto slmSize = EncodeDispatchKernel::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()); - idd.setSharedLocalMemorySize(slmSize); - - uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries; - uint32_t bindingTablePointer = 0u; - bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor); - - if (!isBindlessKernel) { - container.prepareBindfulSsh(); - if (bindingTableStateCount > 0u) { - auto ssh = args.surfaceStateHeap; - if (ssh == nullptr) { - ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), NEO::EncodeDispatchKernel::getDefaultSshAlignment()); - } - bindingTablePointer = static_cast(EncodeSurfaceState::pushBindingTableAndSurfaceStates( - *ssh, - args.dispatchInterface->getSurfaceStateHeapData(), - args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount, - kernelDescriptor.payloadMappings.bindingTable.tableOffset)); - } - } else { - bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr; - auto sshHeapSize = args.dispatchInterface->getSurfaceStateHeapDataSize(); - - if (sshHeapSize > 0u) { - auto ssh = args.surfaceStateHeap; - if (ssh == nullptr) { - container.prepareBindfulSsh(); - ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, sshHeapSize, NEO::EncodeDispatchKernel::getDefaultSshAlignment()); - } - uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase()); - if (globalBindlessSsh) { - bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress()); - } - - DEBUG_BREAK_IF(bindingTableStateCount > 0u); - - // Allocate space for new ssh data - auto dstSurfaceState = ssh->getSpace(sshHeapSize); - memcpy_s(dstSurfaceState, sshHeapSize, args.dispatchInterface->getSurfaceStateHeapData(), sshHeapSize); - - args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset); - } - } - idd.setBindingTablePointer(bindingTablePointer); - - PreemptionHelper::programInterfaceDescriptorDataPreemption(&idd, args.preemptionMode); - - uint32_t samplerStateOffset = 0; - uint32_t samplerCount = 0; - - if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) { - auto dsHeap = args.dynamicStateHeap; - if (dsHeap == nullptr) { - dsHeap = container.getIndirectHeap(HeapType::dynamicState); - auto dshSizeRequired = NEO::EncodeDispatchKernel::getSizeRequiredDsh(kernelDescriptor, container.getNumIddPerBlock()); - if (dsHeap->getAvailableSpace() <= dshSizeRequired) { - dsHeap = container.getHeapWithRequiredSizeAndAlignment(HeapType::dynamicState, dsHeap->getMaxAvailableSpace(), NEO::EncodeDispatchKernel::getDefaultDshAlignment()); - } - } - UNRECOVERABLE_IF(!dsHeap); - - samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers; - samplerStateOffset = EncodeStates::copySamplerState(dsHeap, kernelDescriptor.payloadMappings.samplerTable.tableOffset, - kernelDescriptor.payloadMappings.samplerTable.numSamplers, - kernelDescriptor.payloadMappings.samplerTable.borderColor, - args.dispatchInterface->getDynamicStateHeapData(), - args.device->getBindlessHeapsHelper(), args.device->getRootDeviceEnvironment()); - } - - idd.setSamplerStatePointer(samplerStateOffset); - if (!isBindlessKernel) { - EncodeDispatchKernel::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount); - } - - EncodeDispatchKernel::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData, - sizePerThreadData, rootDeviceEnvironment); - - uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; - bool isHwLocalIdGeneration = false; - uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment); - uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; - uint64_t offsetThreadData = 0u; - { - auto heapIndirect = container.getIndirectHeap(HeapType::indirectObject); - UNRECOVERABLE_IF(!(heapIndirect)); - heapIndirect->align(Family::cacheLineSize); - void *ptr = nullptr; - if (args.isKernelDispatchedFromImmediateCmdList) { - ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::indirectObject, iohRequiredSize, DefaultWalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)->getSpace(iohRequiredSize); - } else { - ptr = container.getHeapSpaceAllowGrow(HeapType::indirectObject, iohRequiredSize); - } - UNRECOVERABLE_IF(!(ptr)); - offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast(heapIndirect->getUsed() - sizeThreadData); - - uint64_t implicitArgsGpuVA = 0u; - if (pImplicitArgs) { - implicitArgsGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + static_cast(heapIndirect->getUsed() - iohRequiredSize); - auto implicitArgsCrossThreadPtr = ptrOffset(const_cast(reinterpret_cast(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); - *implicitArgsCrossThreadPtr = implicitArgsGpuVA; - - ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, rootDeviceEnvironment, nullptr); - } - - memcpy_s(ptr, sizeCrossThreadData, - args.dispatchInterface->getCrossThreadData(), sizeCrossThreadData); - - if (args.isIndirect) { - auto crossThreadDataGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData; - EncodeIndirectParams::encode(container, crossThreadDataGpuVA, args.dispatchInterface, implicitArgsGpuVA); - } - - ptr = ptrOffset(ptr, sizeCrossThreadData); - memcpy_s(ptr, sizePerThreadDataForWholeGroup, - args.dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup); - } - - uint32_t numIDD = 0u; - void *iddPtr = getInterfaceDescriptor(container, args.dynamicStateHeap, numIDD); - - auto slmSizeNew = args.dispatchInterface->getSlmTotalSize(); - bool dirtyHeaps = container.isAnyHeapDirty(); - bool flush = container.slmSizeRef() != slmSizeNew || dirtyHeaps || args.requiresUncachedMocs; - - if (flush) { - PipeControlArgs syncArgs; - syncArgs.dcFlushEnable = args.dcFlushEnable; - if (dirtyHeaps) { - syncArgs.hdcPipelineFlush = true; - } - MemorySynchronizationCommands::addSingleBarrier(*container.getCommandStream(), syncArgs); - - if (dirtyHeaps || args.requiresUncachedMocs) { - STATE_BASE_ADDRESS sba; - auto gmmHelper = container.getDevice()->getGmmHelper(); - uint32_t statelessMocsIndex = - args.requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1); - auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false); - auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true); - EncodeStateBaseAddressArgs encodeStateBaseAddressArgs = { - &container, // container - sba, // sbaCmd - nullptr, // sbaProperties - statelessMocsIndex, // statelessMocsIndex - l1CachePolicy, // l1CachePolicy - l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive - false, // multiOsContextCapable - args.isRcs, // isRcs - container.doubleSbaWaRef(), // doubleSbaWa - false, // heaplessModeEnabled - }; - EncodeStateBaseAddress::encode(encodeStateBaseAddressArgs); - container.setDirtyStateForAllHeaps(false); - args.requiresUncachedMocs = false; - } - - if (container.slmSizeRef() != slmSizeNew) { - EncodeL3State::encode(container, slmSizeNew != 0u); - container.slmSizeRef() = slmSizeNew; - } - } - - if (numIDD == 0 || flush) { - EncodeMediaInterfaceDescriptorLoad::encode(container, args.dynamicStateHeap); - } - - cmd.setIndirectDataStartAddress(static_cast(offsetThreadData)); - cmd.setIndirectDataLength(sizeThreadData); - cmd.setInterfaceDescriptorOffset(numIDD); - - EncodeDispatchKernel::encodeThreadData(cmd, - nullptr, - threadGroupDims, - args.dispatchInterface->getGroupSize(), - kernelDescriptor.kernelAttributes.simdSize, - kernelDescriptor.kernelAttributes.numLocalIdChannels, - numThreadsPerThreadGroup, - args.dispatchInterface->getThreadExecutionMask(), - true, - false, - args.isIndirect, - args.dispatchInterface->getRequiredWorkgroupOrder(), - rootDeviceEnvironment); - - cmd.setPredicateEnable(args.isPredicate); - - auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension(); - EncodeDispatchKernel::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd); - - EncodeWalkerArgs walkerArgs{ - .kernelExecutionType = KernelExecutionType::defaultType, - .requiredDispatchWalkOrder = args.requiredDispatchWalkOrder, - .localRegionSize = args.localRegionSize, - .maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads, - .requiredSystemFence = args.requiresSystemMemoryFence(), - .hasSample = false}; - - EncodeDispatchKernel::encodeAdditionalWalkerFields(rootDeviceEnvironment, cmd, walkerArgs); - EncodeDispatchKernel::encodeWalkerPostSyncFields(cmd, walkerArgs); - EncodeDispatchKernel::template encodeComputeDispatchAllWalker(cmd, nullptr, rootDeviceEnvironment, walkerArgs); - - memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd)); - - if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) { - void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false)); - args.additionalCommands->push_back(commandBuffer); - - EncodeSemaphore::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands); - } - - auto buffer = listCmdBufferStream->getSpaceForCmd(); - *buffer = cmd; - - { - auto mediaStateFlush = listCmdBufferStream->getSpaceForCmd(); - *mediaStateFlush = Family::cmdInitMediaStateFlush; - } - - args.partitionCount = 1; - - if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) { - void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false)); - args.additionalCommands->push_back(commandBuffer); - - EncodeSemaphore::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands); - } -} - -template -void EncodeMediaInterfaceDescriptorLoad::encode(CommandContainer &container, IndirectHeap *childDsh) { - using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH; - using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD; - void *heapBase = nullptr; - if (childDsh != nullptr) { - heapBase = childDsh->getCpuBase(); - } else { - heapBase = container.getIndirectHeap(HeapType::dynamicState)->getCpuBase(); - } - - auto mediaStateFlush = container.getCommandStream()->getSpaceForCmd(); - *mediaStateFlush = Family::cmdInitMediaStateFlush; - - auto iddOffset = static_cast(ptrDiff(container.getIddBlock(), heapBase)); - - MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = Family::cmdInitMediaInterfaceDescriptorLoad; - cmd.setInterfaceDescriptorDataStartAddress(iddOffset); - cmd.setInterfaceDescriptorTotalLength(sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock()); - - auto buffer = container.getCommandStream()->getSpace(sizeof(cmd)); - *(decltype(cmd) *)buffer = cmd; -} - -template -inline bool EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels, - const size_t *lws, - std::array walkOrder, - bool requireInputWalkOrder, - uint32_t &requiredWalkOrder, - uint32_t simd) { - requiredWalkOrder = 0u; - return true; -} - -template -template -void EncodeDispatchKernel::encodeThreadData(WalkerType &walkerCmd, - const uint32_t *startWorkGroup, - const uint32_t *numWorkGroups, - const uint32_t *workGroupSizes, - uint32_t simd, - uint32_t localIdDimensions, - uint32_t threadsPerThreadGroup, - uint32_t threadExecutionMask, - bool localIdsGenerationByRuntime, - bool inlineDataProgrammingRequired, - bool isIndirect, - uint32_t requiredWorkGroupOrder, - const RootDeviceEnvironment &rootDeviceEnvironment) { - - if (isIndirect) { - walkerCmd.setIndirectParameterEnable(true); - } else { - walkerCmd.setThreadGroupIdXDimension(static_cast(numWorkGroups[0])); - walkerCmd.setThreadGroupIdYDimension(static_cast(numWorkGroups[1])); - walkerCmd.setThreadGroupIdZDimension(static_cast(numWorkGroups[2])); - } - - if (startWorkGroup) { - walkerCmd.setThreadGroupIdStartingX(static_cast(startWorkGroup[0])); - walkerCmd.setThreadGroupIdStartingY(static_cast(startWorkGroup[1])); - walkerCmd.setThreadGroupIdStartingResumeZ(static_cast(startWorkGroup[2])); - } - - walkerCmd.setSimdSize(getSimdConfig(simd)); - - auto localWorkSize = static_cast(workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]); - if (threadsPerThreadGroup == 0) { - threadsPerThreadGroup = getThreadsPerWG(simd, localWorkSize); - } - walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup); - - uint64_t executionMask = threadExecutionMask; - if (executionMask == 0) { - auto remainderSimdLanes = localWorkSize & (simd - 1); - executionMask = maxNBitValue(remainderSimdLanes); - if (!executionMask) - executionMask = ~executionMask; - } - - constexpr uint32_t maxDword = std::numeric_limits::max(); - walkerCmd.setRightExecutionMask(static_cast(executionMask)); - walkerCmd.setBottomExecutionMask(maxDword); -} - -template -template -void EncodeDispatchKernel::programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, - const KernelDescriptor &kernelDescriptor, - const HardwareInfo &hwInfo) { - interfaceDescriptor.setBarrierEnable(kernelDescriptor.kernelAttributes.barrierCount); -} - -template -template -inline void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {} - -template -template -inline void EncodeDispatchKernel::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {} - -template -template -inline void EncodeDispatchKernel::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {} - -template -template -void EncodeDispatchKernel::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {} - -template -inline bool EncodeDispatchKernel::isDshNeeded(const DeviceInfo &deviceInfo) { - return true; -} - -template -void EncodeStateBaseAddress::setSbaAddressesForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) { - sbaAddress.indirectObjectBaseAddress = sbaCmd.getIndirectObjectBaseAddress(); - sbaAddress.bindlessSurfaceStateBaseAddress = sbaCmd.getBindlessSurfaceStateBaseAddress(); - sbaAddress.dynamicStateBaseAddress = sbaCmd.getDynamicStateBaseAddress(); - sbaAddress.generalStateBaseAddress = sbaCmd.getGeneralStateBaseAddress(); - sbaAddress.instructionBaseAddress = sbaCmd.getInstructionBaseAddress(); - sbaAddress.surfaceStateBaseAddress = sbaCmd.getSurfaceStateBaseAddress(); -} - -template -void EncodeStateBaseAddress::encode(EncodeStateBaseAddressArgs &args) { - auto &device = *args.container->getDevice(); - - if (args.container->isAnyHeapDirty()) { - EncodeWA::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, true, device.getRootDeviceEnvironment(), args.isRcs); - } - - auto gmmHelper = device.getGmmHelper(); - - auto dsh = args.container->isHeapDirty(HeapType::dynamicState) ? args.container->getIndirectHeap(HeapType::dynamicState) : nullptr; - auto ioh = args.container->isHeapDirty(HeapType::indirectObject) ? args.container->getIndirectHeap(HeapType::indirectObject) : nullptr; - auto ssh = args.container->isHeapDirty(HeapType::surfaceState) ? args.container->getIndirectHeap(HeapType::surfaceState) : nullptr; - auto isDebuggerActive = device.getDebugger() != nullptr; - uint64_t globalHeapsBase = 0; - uint64_t bindlessSurfStateBase = 0; - bool useGlobalSshAndDsh = false; - - if (device.getBindlessHeapsHelper()) { - bindlessSurfStateBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase(); - globalHeapsBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase(); - useGlobalSshAndDsh = true; - } - - StateBaseAddressHelperArgs stateBaseAddressHelperArgs = { - 0, // generalStateBaseAddress - args.container->getIndirectObjectHeapBaseAddress(), // indirectObjectHeapBaseAddress - args.container->getInstructionHeapBaseAddress(), // instructionHeapBaseAddress - globalHeapsBase, // globalHeapsBaseAddress - 0, // surfaceStateBaseAddress - bindlessSurfStateBase, // bindlessSurfaceStateBaseAddress - &args.sbaCmd, // stateBaseAddressCmd - args.sbaProperties, // sbaProperties - dsh, // dsh - ioh, // ioh - ssh, // ssh - gmmHelper, // gmmHelper - args.statelessMocsIndex, // statelessMocsIndex - args.l1CachePolicy, // l1CachePolicy - args.l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive - NEO::MemoryCompressionState::notApplicable, // memoryCompressionState - false, // setInstructionStateBaseAddress - false, // setGeneralStateBaseAddress - useGlobalSshAndDsh, // useGlobalHeapsBaseAddress - false, // isMultiOsContextCapable - false, // areMultipleSubDevicesInContext - false, // overrideSurfaceStateBaseAddress - isDebuggerActive, // isDebuggerActive - args.doubleSbaWa, // doubleSbaWa - args.heaplessModeEnabled // heaplessModeEnabled - }; - - StateBaseAddressHelper::programStateBaseAddressIntoCommandStream(stateBaseAddressHelperArgs, - *args.container->getCommandStream()); - - EncodeWA::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, false, device.getRootDeviceEnvironment(), args.isRcs); -} - -template -size_t EncodeStateBaseAddress::getRequiredSizeForStateBaseAddress(Device &device, CommandContainer &container, bool isRcs) { - return sizeof(typename Family::STATE_BASE_ADDRESS) + 2 * EncodeWA::getAdditionalPipelineSelectSize(device, isRcs); -} - -template -void EncodeMiFlushDW::adjust(MI_FLUSH_DW *miFlushDwCmd, const ProductHelper &productHelper) {} - -template -inline void EncodeWA::addPipeControlPriorToNonPipelinedStateCommand(LinearStream &commandStream, PipeControlArgs args, - const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) { - MemorySynchronizationCommands::addSingleBarrier(commandStream, args); -} - -template -inline void EncodeWA::adjustCompressionFormatForPlanarImage(uint32_t &compressionFormat, int plane) { -} - -template -void EncodeSurfaceState::setCoherencyType(R_SURFACE_STATE *surfaceState, COHERENCY_TYPE coherencyType) { - surfaceState->setCoherencyType(coherencyType); -} - -template -void EncodeSemaphore::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd, - uint64_t compareAddress, - uint64_t compareData, - COMPARE_OPERATION compareMode, - bool registerPollMode, - bool waitMode, - bool useQwordData, - bool indirect, - bool switchOnUnsuccessful) { - constexpr uint64_t upper32b = static_cast(std::numeric_limits::max()) << 32; - UNRECOVERABLE_IF(useQwordData || (compareData & upper32b)); - UNRECOVERABLE_IF(indirect); - - MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait; - localCmd.setCompareOperation(compareMode); - localCmd.setSemaphoreDataDword(static_cast(compareData)); - localCmd.setSemaphoreGraphicsAddress(compareAddress); - localCmd.setWaitMode(waitMode ? MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE : MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_SIGNAL_MODE); - - *cmd = localCmd; -} - -template -void EncodeEnableRayTracing::programEnableRayTracing(LinearStream &commandStream, uint64_t backBuffer) { -} - -template -inline void EncodeStoreMemory::programStoreDataImm(MI_STORE_DATA_IMM *cmdBuffer, - uint64_t gpuAddress, - uint32_t dataDword0, - uint32_t dataDword1, - bool storeQword, - bool workloadPartitionOffset) { - MI_STORE_DATA_IMM storeDataImmediate = Family::cmdInitStoreDataImm; - storeDataImmediate.setAddress(gpuAddress); - storeDataImmediate.setStoreQword(storeQword); - storeDataImmediate.setDataDword0(dataDword0); - if (storeQword) { - storeDataImmediate.setDataDword1(dataDword1); - storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_QWORD); - } else { - storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD); - } - EncodeStoreMemory::encodeForceCompletionCheck(storeDataImmediate); - - *cmdBuffer = storeDataImmediate; -} - -template -template -void EncodeDispatchKernel::setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {} - -template -template -void EncodeDispatchKernel::setupPostSyncForRegularEvent(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} - -template -template -void EncodeDispatchKernel::setupPostSyncForInOrderExec(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} - -template -template -void EncodeDispatchKernel::adjustWalkOrder(WalkerType &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment) {} - -template -size_t EncodeDispatchKernel::additionalSizeRequiredDsh(uint32_t iddCount) { - return iddCount * sizeof(typename Family::INTERFACE_DESCRIPTOR_DATA); -} - -template -inline size_t EncodeDispatchKernel::getInlineDataOffset(EncodeDispatchKernelArgs &args) { - return 0; -} - -template -template -void EncodeDispatchKernel::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) { -} - -template -uint32_t EncodeDispatchKernel::alignSlmSize(uint32_t slmSize) { - if (slmSize == 0u) { - return 0u; - } - slmSize = std::max(slmSize, 1024u); - slmSize = Math::nextPowerOfTwo(slmSize); - UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte); - return slmSize; -} - -template -uint32_t EncodeDispatchKernel::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) { - auto value = std::max(slmSize, 1024u); - value = Math::nextPowerOfTwo(value); - value = Math::getMinLsbSet(value); - value = value - 9; - DEBUG_BREAK_IF(value > 7); - return value * !!slmSize; -} - -template -bool EncodeDispatchKernel::singleTileExecImplicitScalingRequired(bool cooperativeKernel) { - return cooperativeKernel; -} - -template -size_t EncodeStates::getSshHeapSize() { - return 64 * MemoryConstants::kiloByte; -} - -template -void InOrderPatchCommandHelpers::PatchCmd::patchComputeWalker(uint64_t appendCounterValue) { - UNRECOVERABLE_IF(true); -} - -template -template -void EncodeDispatchKernel::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) { -} - -template -template -void EncodeDispatchKernel::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, - const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, - WalkerType &walkerCmd) { -} - -template -size_t EncodeDispatchKernel::getScratchPtrOffsetOfImplicitArgs() { - return 0; -} - -template -void EncodeSurfaceState::setPitchForScratch(R_SURFACE_STATE *surfaceState, uint32_t pitch, const ProductHelper &productHelper) { - surfaceState->setSurfacePitch(pitch); -} - -template -uint32_t EncodeSurfaceState::getPitchForScratchInBytes(R_SURFACE_STATE *surfaceState, const ProductHelper &productHelper) { - return surfaceState->getSurfacePitch(); -} - -template -void EncodeSemaphore::appendSemaphoreCommand(MI_SEMAPHORE_WAIT &cmd, uint64_t compareData, bool indirect, bool useQwordData, bool switchOnUnsuccessful) { - constexpr uint64_t upper32b = static_cast(std::numeric_limits::max()) << 32; - UNRECOVERABLE_IF(useQwordData || (compareData & upper32b)); -} - -template -template -void EncodeDispatchKernel::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &submissionCsr) { -} - -template -template -void EncodeDispatchKernel::encodeEuSchedulingPolicy(InterfaceDescriptorType *pInterfaceDescriptor, const KernelDescriptor &kernelDesc, int32_t defaultPipelinedThreadArbitrationPolicy) { -} - -template -template -void EncodeDispatchKernel::setWalkerRegionSettings(WalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount, uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder) {} - -template -template -void EncodeDispatchKernel::adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} -} // namespace NEO diff --git a/shared/source/gen12lp/command_encoder_gen12lp.cpp b/shared/source/gen12lp/command_encoder_gen12lp.cpp index 4e032d1f6a..9cc3365903 100644 --- a/shared/source/gen12lp/command_encoder_gen12lp.cpp +++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,20 +7,37 @@ #include "shared/source/command_container/command_encoder.h" #include "shared/source/command_container/command_encoder.inl" -#include "shared/source/command_container/command_encoder_bdw_and_later.inl" #include "shared/source/command_container/command_encoder_from_gen12lp_to_xe2_hpg.inl" #include "shared/source/command_container/command_encoder_gen12lp_and_xe_hpg.inl" #include "shared/source/command_container/command_encoder_pre_xe2_hpg_core.inl" #include "shared/source/command_container/command_encoder_tgllp_and_later.inl" +#include "shared/source/command_container/encode_surface_state.h" +#include "shared/source/command_stream/linear_stream.h" +#include "shared/source/command_stream/memory_compression_state.h" +#include "shared/source/command_stream/preemption.h" #include "shared/source/command_stream/stream_properties.h" +#include "shared/source/execution_environment/execution_environment.h" #include "shared/source/gen12lp/hw_cmds_base.h" #include "shared/source/gen12lp/reg_configs.h" #include "shared/source/gmm_helper/gmm_helper.h" +#include "shared/source/helpers/api_specific_config.h" +#include "shared/source/helpers/cache_policy.h" +#include "shared/source/helpers/gfx_core_helper.h" +#include "shared/source/helpers/in_order_cmd_helpers.h" +#include "shared/source/helpers/pause_on_gpu_properties.h" +#include "shared/source/helpers/pipe_control_args.h" +#include "shared/source/helpers/pipeline_select_args.h" #include "shared/source/helpers/preamble.h" +#include "shared/source/helpers/simd_helper.h" +#include "shared/source/helpers/state_base_address.h" +#include "shared/source/kernel/dispatch_kernel_encoder_interface.h" +#include "shared/source/kernel/implicit_args_helper.h" #include "shared/source/release_helper/release_helper.h" #include "encode_surface_state_args.h" +#include + using Family = NEO::Gen12LpFamily; #include "shared/source/command_container/command_encoder_heap_addressing.inl" @@ -28,6 +45,667 @@ using Family = NEO::Gen12LpFamily; namespace NEO { +template +template +void EncodeDispatchKernel::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount, + const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, + const RootDeviceEnvironment &rootDeviceEnvironment) { + auto grfSize = sizeof(typename Family::GRF); + DEBUG_BREAK_IF((sizeCrossThreadData % grfSize) != 0); + auto numGrfCrossThreadData = static_cast(sizeCrossThreadData / grfSize); + DEBUG_BREAK_IF(numGrfCrossThreadData == 0); + pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData); + + DEBUG_BREAK_IF((sizePerThreadData % grfSize) != 0); + auto numGrfPerThreadData = static_cast(sizePerThreadData / grfSize); + + // at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0 + numGrfPerThreadData = std::max(numGrfPerThreadData, 1u); + pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData); +} + +template +template +void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDispatchKernelArgs &args) { + + using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH; + using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS; + + auto &kernelDescriptor = args.dispatchInterface->getKernelDescriptor(); + auto sizeCrossThreadData = args.dispatchInterface->getCrossThreadDataSize(); + auto sizePerThreadData = args.dispatchInterface->getPerThreadDataSize(); + auto sizePerThreadDataForWholeGroup = args.dispatchInterface->getPerThreadDataSizeForWholeThreadGroup(); + auto pImplicitArgs = args.dispatchInterface->getImplicitArgs(); + + auto &hwInfo = args.device->getHardwareInfo(); + auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment(); + + LinearStream *listCmdBufferStream = container.getCommandStream(); + + auto threadGroupDims = static_cast(args.threadGroupDimensions); + + DefaultWalkerType cmd = Family::cmdInitGpgpuWalker; + auto idd = Family::cmdInitInterfaceDescriptorData; + { + auto alloc = args.dispatchInterface->getIsaAllocation(); + UNRECOVERABLE_IF(nullptr == alloc); + auto offset = alloc->getGpuAddressToPatch() + args.dispatchInterface->getIsaOffsetInParentAllocation(); + idd.setKernelStartPointer(offset); + idd.setKernelStartPointerHigh(0u); + } + + if (args.dispatchInterface->getKernelDescriptor().kernelAttributes.flags.usesAssert && args.device->getL0Debugger() != nullptr) { + idd.setSoftwareExceptionEnable(1); + } + + auto numThreadsPerThreadGroup = args.dispatchInterface->getNumThreadsPerThreadGroup(); + idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup); + + EncodeDispatchKernel::programBarrierEnable(idd, + kernelDescriptor, + hwInfo); + auto slmSize = EncodeDispatchKernel::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()); + idd.setSharedLocalMemorySize(slmSize); + + uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries; + uint32_t bindingTablePointer = 0u; + bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor); + + if (!isBindlessKernel) { + container.prepareBindfulSsh(); + if (bindingTableStateCount > 0u) { + auto ssh = args.surfaceStateHeap; + if (ssh == nullptr) { + ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), NEO::EncodeDispatchKernel::getDefaultSshAlignment()); + } + bindingTablePointer = static_cast(EncodeSurfaceState::pushBindingTableAndSurfaceStates( + *ssh, + args.dispatchInterface->getSurfaceStateHeapData(), + args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount, + kernelDescriptor.payloadMappings.bindingTable.tableOffset)); + } + } else { + bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr; + auto sshHeapSize = args.dispatchInterface->getSurfaceStateHeapDataSize(); + + if (sshHeapSize > 0u) { + auto ssh = args.surfaceStateHeap; + if (ssh == nullptr) { + container.prepareBindfulSsh(); + ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, sshHeapSize, NEO::EncodeDispatchKernel::getDefaultSshAlignment()); + } + uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase()); + if (globalBindlessSsh) { + bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress()); + } + + DEBUG_BREAK_IF(bindingTableStateCount > 0u); + + // Allocate space for new ssh data + auto dstSurfaceState = ssh->getSpace(sshHeapSize); + memcpy_s(dstSurfaceState, sshHeapSize, args.dispatchInterface->getSurfaceStateHeapData(), sshHeapSize); + + args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset); + } + } + idd.setBindingTablePointer(bindingTablePointer); + + PreemptionHelper::programInterfaceDescriptorDataPreemption(&idd, args.preemptionMode); + + uint32_t samplerStateOffset = 0; + uint32_t samplerCount = 0; + + if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) { + auto dsHeap = args.dynamicStateHeap; + if (dsHeap == nullptr) { + dsHeap = container.getIndirectHeap(HeapType::dynamicState); + auto dshSizeRequired = NEO::EncodeDispatchKernel::getSizeRequiredDsh(kernelDescriptor, container.getNumIddPerBlock()); + if (dsHeap->getAvailableSpace() <= dshSizeRequired) { + dsHeap = container.getHeapWithRequiredSizeAndAlignment(HeapType::dynamicState, dsHeap->getMaxAvailableSpace(), NEO::EncodeDispatchKernel::getDefaultDshAlignment()); + } + } + UNRECOVERABLE_IF(!dsHeap); + + samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers; + samplerStateOffset = EncodeStates::copySamplerState(dsHeap, kernelDescriptor.payloadMappings.samplerTable.tableOffset, + kernelDescriptor.payloadMappings.samplerTable.numSamplers, + kernelDescriptor.payloadMappings.samplerTable.borderColor, + args.dispatchInterface->getDynamicStateHeapData(), + args.device->getBindlessHeapsHelper(), args.device->getRootDeviceEnvironment()); + } + + idd.setSamplerStatePointer(samplerStateOffset); + if (!isBindlessKernel) { + EncodeDispatchKernel::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount); + } + + EncodeDispatchKernel::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData, + sizePerThreadData, rootDeviceEnvironment); + + uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; + bool isHwLocalIdGeneration = false; + uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment); + uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; + uint64_t offsetThreadData = 0u; + { + auto heapIndirect = container.getIndirectHeap(HeapType::indirectObject); + UNRECOVERABLE_IF(!(heapIndirect)); + heapIndirect->align(Family::cacheLineSize); + void *ptr = nullptr; + if (args.isKernelDispatchedFromImmediateCmdList) { + ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::indirectObject, iohRequiredSize, DefaultWalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)->getSpace(iohRequiredSize); + } else { + ptr = container.getHeapSpaceAllowGrow(HeapType::indirectObject, iohRequiredSize); + } + UNRECOVERABLE_IF(!(ptr)); + offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast(heapIndirect->getUsed() - sizeThreadData); + + uint64_t implicitArgsGpuVA = 0u; + if (pImplicitArgs) { + implicitArgsGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + static_cast(heapIndirect->getUsed() - iohRequiredSize); + auto implicitArgsCrossThreadPtr = ptrOffset(const_cast(reinterpret_cast(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); + *implicitArgsCrossThreadPtr = implicitArgsGpuVA; + + ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, rootDeviceEnvironment, nullptr); + } + + memcpy_s(ptr, sizeCrossThreadData, + args.dispatchInterface->getCrossThreadData(), sizeCrossThreadData); + + if (args.isIndirect) { + auto crossThreadDataGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData; + EncodeIndirectParams::encode(container, crossThreadDataGpuVA, args.dispatchInterface, implicitArgsGpuVA); + } + + ptr = ptrOffset(ptr, sizeCrossThreadData); + memcpy_s(ptr, sizePerThreadDataForWholeGroup, + args.dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup); + } + + uint32_t numIDD = 0u; + void *iddPtr = getInterfaceDescriptor(container, args.dynamicStateHeap, numIDD); + + auto slmSizeNew = args.dispatchInterface->getSlmTotalSize(); + bool dirtyHeaps = container.isAnyHeapDirty(); + bool flush = container.slmSizeRef() != slmSizeNew || dirtyHeaps || args.requiresUncachedMocs; + + if (flush) { + PipeControlArgs syncArgs; + syncArgs.dcFlushEnable = args.dcFlushEnable; + if (dirtyHeaps) { + syncArgs.hdcPipelineFlush = true; + } + MemorySynchronizationCommands::addSingleBarrier(*container.getCommandStream(), syncArgs); + + if (dirtyHeaps || args.requiresUncachedMocs) { + STATE_BASE_ADDRESS sba; + auto gmmHelper = container.getDevice()->getGmmHelper(); + uint32_t statelessMocsIndex = + args.requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1); + auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false); + auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true); + EncodeStateBaseAddressArgs encodeStateBaseAddressArgs = { + &container, // container + sba, // sbaCmd + nullptr, // sbaProperties + statelessMocsIndex, // statelessMocsIndex + l1CachePolicy, // l1CachePolicy + l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive + false, // multiOsContextCapable + args.isRcs, // isRcs + container.doubleSbaWaRef(), // doubleSbaWa + false, // heaplessModeEnabled + }; + EncodeStateBaseAddress::encode(encodeStateBaseAddressArgs); + container.setDirtyStateForAllHeaps(false); + args.requiresUncachedMocs = false; + } + + if (container.slmSizeRef() != slmSizeNew) { + EncodeL3State::encode(container, slmSizeNew != 0u); + container.slmSizeRef() = slmSizeNew; + } + } + + if (numIDD == 0 || flush) { + EncodeMediaInterfaceDescriptorLoad::encode(container, args.dynamicStateHeap); + } + + cmd.setIndirectDataStartAddress(static_cast(offsetThreadData)); + cmd.setIndirectDataLength(sizeThreadData); + cmd.setInterfaceDescriptorOffset(numIDD); + + EncodeDispatchKernel::encodeThreadData(cmd, + nullptr, + threadGroupDims, + args.dispatchInterface->getGroupSize(), + kernelDescriptor.kernelAttributes.simdSize, + kernelDescriptor.kernelAttributes.numLocalIdChannels, + numThreadsPerThreadGroup, + args.dispatchInterface->getThreadExecutionMask(), + true, + false, + args.isIndirect, + args.dispatchInterface->getRequiredWorkgroupOrder(), + rootDeviceEnvironment); + + cmd.setPredicateEnable(args.isPredicate); + + auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension(); + EncodeDispatchKernel::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd); + + EncodeWalkerArgs walkerArgs{ + .kernelExecutionType = KernelExecutionType::defaultType, + .requiredDispatchWalkOrder = args.requiredDispatchWalkOrder, + .localRegionSize = args.localRegionSize, + .maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads, + .requiredSystemFence = args.requiresSystemMemoryFence(), + .hasSample = false}; + + EncodeDispatchKernel::encodeAdditionalWalkerFields(rootDeviceEnvironment, cmd, walkerArgs); + EncodeDispatchKernel::encodeWalkerPostSyncFields(cmd, walkerArgs); + EncodeDispatchKernel::template encodeComputeDispatchAllWalker(cmd, nullptr, rootDeviceEnvironment, walkerArgs); + + memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd)); + + if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) { + void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false)); + args.additionalCommands->push_back(commandBuffer); + + EncodeSemaphore::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands); + } + + auto buffer = listCmdBufferStream->getSpaceForCmd(); + *buffer = cmd; + + { + auto mediaStateFlush = listCmdBufferStream->getSpaceForCmd(); + *mediaStateFlush = Family::cmdInitMediaStateFlush; + } + + args.partitionCount = 1; + + if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) { + void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false)); + args.additionalCommands->push_back(commandBuffer); + + EncodeSemaphore::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands); + } +} + +template +void EncodeMediaInterfaceDescriptorLoad::encode(CommandContainer &container, IndirectHeap *childDsh) { + using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH; + using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD; + void *heapBase = nullptr; + if (childDsh != nullptr) { + heapBase = childDsh->getCpuBase(); + } else { + heapBase = container.getIndirectHeap(HeapType::dynamicState)->getCpuBase(); + } + + auto mediaStateFlush = container.getCommandStream()->getSpaceForCmd(); + *mediaStateFlush = Family::cmdInitMediaStateFlush; + + auto iddOffset = static_cast(ptrDiff(container.getIddBlock(), heapBase)); + + MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = Family::cmdInitMediaInterfaceDescriptorLoad; + cmd.setInterfaceDescriptorDataStartAddress(iddOffset); + cmd.setInterfaceDescriptorTotalLength(sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock()); + + auto buffer = container.getCommandStream()->getSpace(sizeof(cmd)); + *(decltype(cmd) *)buffer = cmd; +} + +template +inline bool EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels, + const size_t *lws, + std::array walkOrder, + bool requireInputWalkOrder, + uint32_t &requiredWalkOrder, + uint32_t simd) { + requiredWalkOrder = 0u; + return true; +} + +template +template +void EncodeDispatchKernel::encodeThreadData(WalkerType &walkerCmd, + const uint32_t *startWorkGroup, + const uint32_t *numWorkGroups, + const uint32_t *workGroupSizes, + uint32_t simd, + uint32_t localIdDimensions, + uint32_t threadsPerThreadGroup, + uint32_t threadExecutionMask, + bool localIdsGenerationByRuntime, + bool inlineDataProgrammingRequired, + bool isIndirect, + uint32_t requiredWorkGroupOrder, + const RootDeviceEnvironment &rootDeviceEnvironment) { + + if (isIndirect) { + walkerCmd.setIndirectParameterEnable(true); + } else { + walkerCmd.setThreadGroupIdXDimension(static_cast(numWorkGroups[0])); + walkerCmd.setThreadGroupIdYDimension(static_cast(numWorkGroups[1])); + walkerCmd.setThreadGroupIdZDimension(static_cast(numWorkGroups[2])); + } + + if (startWorkGroup) { + walkerCmd.setThreadGroupIdStartingX(static_cast(startWorkGroup[0])); + walkerCmd.setThreadGroupIdStartingY(static_cast(startWorkGroup[1])); + walkerCmd.setThreadGroupIdStartingResumeZ(static_cast(startWorkGroup[2])); + } + + walkerCmd.setSimdSize(getSimdConfig(simd)); + + auto localWorkSize = static_cast(workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]); + if (threadsPerThreadGroup == 0) { + threadsPerThreadGroup = getThreadsPerWG(simd, localWorkSize); + } + walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup); + + uint64_t executionMask = threadExecutionMask; + if (executionMask == 0) { + auto remainderSimdLanes = localWorkSize & (simd - 1); + executionMask = maxNBitValue(remainderSimdLanes); + if (!executionMask) + executionMask = ~executionMask; + } + + constexpr uint32_t maxDword = std::numeric_limits::max(); + walkerCmd.setRightExecutionMask(static_cast(executionMask)); + walkerCmd.setBottomExecutionMask(maxDword); +} + +template +template +void EncodeDispatchKernel::programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, + const KernelDescriptor &kernelDescriptor, + const HardwareInfo &hwInfo) { + interfaceDescriptor.setBarrierEnable(kernelDescriptor.kernelAttributes.barrierCount); +} + +template +template +inline void EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {} + +template +template +inline void EncodeDispatchKernel::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {} + +template +template +inline void EncodeDispatchKernel::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {} + +template +template +void EncodeDispatchKernel::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {} + +template +inline bool EncodeDispatchKernel::isDshNeeded(const DeviceInfo &deviceInfo) { + return true; +} + +template +void EncodeStateBaseAddress::setSbaAddressesForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) { + sbaAddress.indirectObjectBaseAddress = sbaCmd.getIndirectObjectBaseAddress(); + sbaAddress.bindlessSurfaceStateBaseAddress = sbaCmd.getBindlessSurfaceStateBaseAddress(); + sbaAddress.dynamicStateBaseAddress = sbaCmd.getDynamicStateBaseAddress(); + sbaAddress.generalStateBaseAddress = sbaCmd.getGeneralStateBaseAddress(); + sbaAddress.instructionBaseAddress = sbaCmd.getInstructionBaseAddress(); + sbaAddress.surfaceStateBaseAddress = sbaCmd.getSurfaceStateBaseAddress(); +} + +template +void EncodeStateBaseAddress::encode(EncodeStateBaseAddressArgs &args) { + auto &device = *args.container->getDevice(); + + if (args.container->isAnyHeapDirty()) { + EncodeWA::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, true, device.getRootDeviceEnvironment(), args.isRcs); + } + + auto gmmHelper = device.getGmmHelper(); + + auto dsh = args.container->isHeapDirty(HeapType::dynamicState) ? args.container->getIndirectHeap(HeapType::dynamicState) : nullptr; + auto ioh = args.container->isHeapDirty(HeapType::indirectObject) ? args.container->getIndirectHeap(HeapType::indirectObject) : nullptr; + auto ssh = args.container->isHeapDirty(HeapType::surfaceState) ? args.container->getIndirectHeap(HeapType::surfaceState) : nullptr; + auto isDebuggerActive = device.getDebugger() != nullptr; + uint64_t globalHeapsBase = 0; + uint64_t bindlessSurfStateBase = 0; + bool useGlobalSshAndDsh = false; + + if (device.getBindlessHeapsHelper()) { + bindlessSurfStateBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase(); + globalHeapsBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase(); + useGlobalSshAndDsh = true; + } + + StateBaseAddressHelperArgs stateBaseAddressHelperArgs = { + 0, // generalStateBaseAddress + args.container->getIndirectObjectHeapBaseAddress(), // indirectObjectHeapBaseAddress + args.container->getInstructionHeapBaseAddress(), // instructionHeapBaseAddress + globalHeapsBase, // globalHeapsBaseAddress + 0, // surfaceStateBaseAddress + bindlessSurfStateBase, // bindlessSurfaceStateBaseAddress + &args.sbaCmd, // stateBaseAddressCmd + args.sbaProperties, // sbaProperties + dsh, // dsh + ioh, // ioh + ssh, // ssh + gmmHelper, // gmmHelper + args.statelessMocsIndex, // statelessMocsIndex + args.l1CachePolicy, // l1CachePolicy + args.l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive + NEO::MemoryCompressionState::notApplicable, // memoryCompressionState + false, // setInstructionStateBaseAddress + false, // setGeneralStateBaseAddress + useGlobalSshAndDsh, // useGlobalHeapsBaseAddress + false, // isMultiOsContextCapable + false, // areMultipleSubDevicesInContext + false, // overrideSurfaceStateBaseAddress + isDebuggerActive, // isDebuggerActive + args.doubleSbaWa, // doubleSbaWa + args.heaplessModeEnabled // heaplessModeEnabled + }; + + StateBaseAddressHelper::programStateBaseAddressIntoCommandStream(stateBaseAddressHelperArgs, + *args.container->getCommandStream()); + + EncodeWA::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, false, device.getRootDeviceEnvironment(), args.isRcs); +} + +template +size_t EncodeStateBaseAddress::getRequiredSizeForStateBaseAddress(Device &device, CommandContainer &container, bool isRcs) { + return sizeof(typename Family::STATE_BASE_ADDRESS) + 2 * EncodeWA::getAdditionalPipelineSelectSize(device, isRcs); +} + +template +void EncodeMiFlushDW::adjust(MI_FLUSH_DW *miFlushDwCmd, const ProductHelper &productHelper) {} + +template +inline void EncodeWA::addPipeControlPriorToNonPipelinedStateCommand(LinearStream &commandStream, PipeControlArgs args, + const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) { + MemorySynchronizationCommands::addSingleBarrier(commandStream, args); +} + +template +inline void EncodeWA::adjustCompressionFormatForPlanarImage(uint32_t &compressionFormat, int plane) { +} + +template +void EncodeSurfaceState::setCoherencyType(R_SURFACE_STATE *surfaceState, COHERENCY_TYPE coherencyType) { + surfaceState->setCoherencyType(coherencyType); +} + +template +void EncodeSemaphore::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd, + uint64_t compareAddress, + uint64_t compareData, + COMPARE_OPERATION compareMode, + bool registerPollMode, + bool waitMode, + bool useQwordData, + bool indirect, + bool switchOnUnsuccessful) { + constexpr uint64_t upper32b = static_cast(std::numeric_limits::max()) << 32; + UNRECOVERABLE_IF(useQwordData || (compareData & upper32b)); + UNRECOVERABLE_IF(indirect); + + MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait; + localCmd.setCompareOperation(compareMode); + localCmd.setSemaphoreDataDword(static_cast(compareData)); + localCmd.setSemaphoreGraphicsAddress(compareAddress); + localCmd.setWaitMode(waitMode ? MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE : MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_SIGNAL_MODE); + + *cmd = localCmd; +} + +template +void EncodeEnableRayTracing::programEnableRayTracing(LinearStream &commandStream, uint64_t backBuffer) { +} + +template +inline void EncodeStoreMemory::programStoreDataImm(MI_STORE_DATA_IMM *cmdBuffer, + uint64_t gpuAddress, + uint32_t dataDword0, + uint32_t dataDword1, + bool storeQword, + bool workloadPartitionOffset) { + MI_STORE_DATA_IMM storeDataImmediate = Family::cmdInitStoreDataImm; + storeDataImmediate.setAddress(gpuAddress); + storeDataImmediate.setStoreQword(storeQword); + storeDataImmediate.setDataDword0(dataDword0); + if (storeQword) { + storeDataImmediate.setDataDword1(dataDword1); + storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_QWORD); + } else { + storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD); + } + EncodeStoreMemory::encodeForceCompletionCheck(storeDataImmediate); + + *cmdBuffer = storeDataImmediate; +} + +template +template +void EncodeDispatchKernel::setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {} + +template +template +void EncodeDispatchKernel::setupPostSyncForRegularEvent(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} + +template +template +void EncodeDispatchKernel::setupPostSyncForInOrderExec(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} + +template +template +void EncodeDispatchKernel::adjustWalkOrder(WalkerType &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment) {} + +template +size_t EncodeDispatchKernel::additionalSizeRequiredDsh(uint32_t iddCount) { + return iddCount * sizeof(typename Family::INTERFACE_DESCRIPTOR_DATA); +} + +template +inline size_t EncodeDispatchKernel::getInlineDataOffset(EncodeDispatchKernelArgs &args) { + return 0; +} + +template +template +void EncodeDispatchKernel::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) { +} + +template +uint32_t EncodeDispatchKernel::alignSlmSize(uint32_t slmSize) { + if (slmSize == 0u) { + return 0u; + } + slmSize = std::max(slmSize, 1024u); + slmSize = Math::nextPowerOfTwo(slmSize); + UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte); + return slmSize; +} + +template +uint32_t EncodeDispatchKernel::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) { + auto value = std::max(slmSize, 1024u); + value = Math::nextPowerOfTwo(value); + value = Math::getMinLsbSet(value); + value = value - 9; + DEBUG_BREAK_IF(value > 7); + return value * !!slmSize; +} + +template +bool EncodeDispatchKernel::singleTileExecImplicitScalingRequired(bool cooperativeKernel) { + return cooperativeKernel; +} + +template +size_t EncodeStates::getSshHeapSize() { + return 64 * MemoryConstants::kiloByte; +} + +template +void InOrderPatchCommandHelpers::PatchCmd::patchComputeWalker(uint64_t appendCounterValue) { + UNRECOVERABLE_IF(true); +} + +template +template +void EncodeDispatchKernel::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) { +} + +template +template +void EncodeDispatchKernel::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, + const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, + WalkerType &walkerCmd) { +} + +template +size_t EncodeDispatchKernel::getScratchPtrOffsetOfImplicitArgs() { + return 0; +} + +template +void EncodeSurfaceState::setPitchForScratch(R_SURFACE_STATE *surfaceState, uint32_t pitch, const ProductHelper &productHelper) { + surfaceState->setSurfacePitch(pitch); +} + +template +uint32_t EncodeSurfaceState::getPitchForScratchInBytes(R_SURFACE_STATE *surfaceState, const ProductHelper &productHelper) { + return surfaceState->getSurfacePitch(); +} + +template +void EncodeSemaphore::appendSemaphoreCommand(MI_SEMAPHORE_WAIT &cmd, uint64_t compareData, bool indirect, bool useQwordData, bool switchOnUnsuccessful) { + constexpr uint64_t upper32b = static_cast(std::numeric_limits::max()) << 32; + UNRECOVERABLE_IF(useQwordData || (compareData & upper32b)); +} + +template +template +void EncodeDispatchKernel::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &submissionCsr) { +} + +template +template +void EncodeDispatchKernel::encodeEuSchedulingPolicy(InterfaceDescriptorType *pInterfaceDescriptor, const KernelDescriptor &kernelDesc, int32_t defaultPipelinedThreadArbitrationPolicy) { +} + +template +template +void EncodeDispatchKernel::setWalkerRegionSettings(WalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount, uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder) {} + +template +template +void EncodeDispatchKernel::adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} + template <> size_t EncodeWA::getAdditionalPipelineSelectSize(Device &device, bool isRcs) { size_t size = 0;