/* * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * */ #include "shared/source/command_container/command_encoder.h" #include "shared/source/command_container/encode_surface_state.h" #include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/command_stream/linear_stream.h" #include "shared/source/command_stream/preemption.h" #include "shared/source/command_stream/scratch_space_controller.h" #include "shared/source/command_stream/stream_properties.h" #include "shared/source/debug_settings/debug_settings_manager.h" #include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/gmm_helper/client_context/gmm_client_context.h" #include "shared/source/gmm_helper/gmm.h" #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/gmm_helper/resource_info.h" #include "shared/source/helpers/basic_math.h" #include "shared/source/helpers/cache_policy.h" #include "shared/source/helpers/compiler_product_helper.h" #include "shared/source/helpers/constants.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/hw_walk_order.h" #include "shared/source/helpers/in_order_cmd_helpers.h" #include "shared/source/helpers/pause_on_gpu_properties.h" #include "shared/source/helpers/pipe_control_args.h" #include "shared/source/helpers/ray_tracing_helper.h" #include "shared/source/helpers/simd_helper.h" #include "shared/source/helpers/state_base_address.h" #include "shared/source/kernel/dispatch_kernel_encoder_interface.h" #include "shared/source/kernel/implicit_args_helper.h" #include "shared/source/kernel/kernel_descriptor.h" #include "shared/source/os_interface/product_helper.h" #include "shared/source/release_helper/release_helper.h" #include "encode_dispatch_kernel_args_ext.h" #include "encode_surface_state_args.h" #include #include namespace NEO { template template void 
EncodeDispatchKernel::encode(CommandContainer &container, EncodeDispatchKernelArgs &args) {
    // Builds a WalkerType command for the kernel exposed by args.dispatchInterface:
    // fills its interface descriptor, programs surface/sampler state, copies the
    // kernel payload and finally writes the walker into the container's command
    // stream and/or args.cpuWalkerBuffer.
    //
    // When args.makeCommandView is set, the heap, state-base-address and
    // command-stream steps guarded by !args.makeCommandView are skipped and the
    // payload is written to args.cpuPayloadBuffer instead.
    //
    // NOTE(review): template argument lists (e.g. on static_cast/reinterpret_cast)
    // are absent in this copy of the file and are reproduced as-is - confirm
    // against the upstream source.
    //
    // The statements were previously collapsed onto a few physical lines, so every
    // `//` comment commented out the code that followed it. One statement per line
    // is restored here; the tokens themselves are unchanged.
    using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;

    // A command view needs both CPU-side destination buffers.
    UNRECOVERABLE_IF(args.makeCommandView && (args.cpuWalkerBuffer == nullptr || args.cpuPayloadBuffer == nullptr));

    constexpr bool heaplessModeEnabled = Family::template isHeaplessMode();
    const HardwareInfo &hwInfo = args.device->getHardwareInfo();
    auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();

    const auto &kernelDescriptor = args.dispatchInterface->getKernelDescriptor();
    auto sizeCrossThreadData = args.dispatchInterface->getCrossThreadDataSize();
    auto sizePerThreadData = args.dispatchInterface->getPerThreadDataSize();
    auto sizePerThreadDataForWholeGroup = args.dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
    auto pImplicitArgs = args.dispatchInterface->getImplicitArgs();

    LinearStream *listCmdBufferStream = container.getCommandStream();

    // threadDimsVec stays {0, 0, 0} for indirect dispatches.
    auto threadGroupDims = static_cast(args.threadGroupDimensions);
    uint32_t threadDimsVec[3] = {0, 0, 0};
    if (!args.isIndirect) {
        threadDimsVec[0] = threadGroupDims[0];
        threadDimsVec[1] = threadGroupDims[1];
        threadDimsVec[2] = threadGroupDims[2];
    }

    // Adjust pipeline select only when systolic mode is supported and the
    // requirement differs from the last one recorded in the container.
    if (!args.makeCommandView) {
        bool systolicModeRequired = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode;
        if (container.systolicModeSupportRef() && (container.lastPipelineSelectModeRequiredRef() != systolicModeRequired)) {
            container.lastPipelineSelectModeRequiredRef() = systolicModeRequired;
            EncodeComputeMode::adjustPipelineSelect(container, kernelDescriptor);
        }
    }

    WalkerType walkerCmd = Family::template getInitGpuWalker();
    auto &idd = walkerCmd.getInterfaceDescriptor();

    EncodeDispatchKernel::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData, sizePerThreadData, rootDeviceEnvironment);

    bool localIdsGenerationByRuntime = args.dispatchInterface->requiresGenerationOfLocalIdsByRuntime();
    auto requiredWorkgroupOrder = args.dispatchInterface->getRequiredWorkgroupOrder();

    // Kernel start pointer: ISA offset plus the allocation address (full GPU
    // address in heapless mode, the address-to-patch otherwise).
    {
        auto isaAllocation = args.dispatchInterface->getIsaAllocation();
        UNRECOVERABLE_IF(nullptr == isaAllocation);

        uint64_t kernelStartPointer = args.dispatchInterface->getIsaOffsetInParentAllocation();
        if constexpr (heaplessModeEnabled) {
            kernelStartPointer += isaAllocation->getGpuAddress();
        } else {
            kernelStartPointer += isaAllocation->getGpuAddressToPatch();
        }

        // When the runtime does not generate local ids, the entry point is
        // advanced by the skipPerThreadDataLoad offset.
        if (!localIdsGenerationByRuntime) {
            kernelStartPointer += kernelDescriptor.entryPoints.skipPerThreadDataLoad;
        }
        idd.setKernelStartPointer(kernelStartPointer);
    }

    if (args.dispatchInterface->getKernelDescriptor().kernelAttributes.flags.usesAssert && args.device->getL0Debugger() != nullptr) {
        idd.setSoftwareExceptionEnable(1);
    }

    auto threadsPerThreadGroup = args.dispatchInterface->getNumThreadsPerThreadGroup();
    idd.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);

    EncodeDispatchKernel::programBarrierEnable(idd, kernelDescriptor, hwInfo);

    EncodeDispatchKernel::encodeEuSchedulingPolicy(&idd, kernelDescriptor, args.defaultPipelinedThreadArbitrationPolicy);

    auto releaseHelper = rootDeviceEnvironment.getReleaseHelper();
    auto slmSize = EncodeDispatchKernel::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize(), releaseHelper, heaplessModeEnabled);

    // Debug override: any value other than -1 replaces the computed SLM size.
    if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) {
        slmSize = static_cast(debugManager.flags.OverrideSlmAllocationSize.get());
    }
    idd.setSharedLocalMemorySize(slmSize);

    auto bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
    bool sshProgrammingRequired = true;

    auto &productHelper = args.device->getProductHelper();
    if (productHelper.isSkippingStatefulInformationRequired(kernelDescriptor)) {
        bindingTableStateCount = 0u;
        sshProgrammingRequired = false;
    }

    // Surface state programming: bindless kernels copy their surface state heap
    // data and patch offsets into cross-thread data; other kernels push a
    // binding table (non-heapless builds only).
    if (sshProgrammingRequired && !args.makeCommandView) {
        bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor);
        if (isBindlessKernel) {
            bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr;
            auto sshHeapSize = args.dispatchInterface->getSurfaceStateHeapDataSize();

            if (sshHeapSize > 0u) {
                auto ssh = args.surfaceStateHeap;
                if (ssh == nullptr) {
                    container.prepareBindfulSsh();
                    ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, sshHeapSize, NEO::EncodeDispatchKernel::getDefaultSshAlignment());
                }
                // Offset of the next free SSH byte relative to the heap's CPU base.
                uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase());
                if (globalBindlessSsh) {
                    bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress());
                }

                DEBUG_BREAK_IF(bindingTableStateCount > 0u);

                if (bindingTableStateCount == 0) {
                    // Allocate space for new ssh data
                    auto dstSurfaceState = ssh->getSpace(sshHeapSize);
                    memcpy_s(dstSurfaceState, sshHeapSize, args.dispatchInterface->getSurfaceStateHeapData(), sshHeapSize);
                }
                args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset);
            }

        } else {
            if constexpr (heaplessModeEnabled == false) {
                if (bindingTableStateCount > 0u) {
                    auto ssh = args.surfaceStateHeap;
                    if (ssh == nullptr) {
                        container.prepareBindfulSsh();
                        ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), NEO::EncodeDispatchKernel::getDefaultSshAlignment());
                    }
                    auto bindingTablePointer = static_cast(EncodeSurfaceState::pushBindingTableAndSurfaceStates(
                        *ssh,
                        args.dispatchInterface->getSurfaceStateHeapData(),
                        args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
                        kernelDescriptor.payloadMappings.bindingTable.tableOffset));

                    idd.setBindingTablePointer(bindingTablePointer);
                }
            }
        }
    }

    // An attached debugger forces thread-group preemption.
    auto preemptionMode = args.device->getDebugger() ? PreemptionMode::ThreadGroup : args.preemptionMode;
    PreemptionHelper::programInterfaceDescriptorDataPreemption(&idd, preemptionMode);

    uint32_t samplerCount = 0;

    // Sampler state: copied into the dynamic state heap only when the family
    // supports samplers, the device reports image support and the kernel
    // actually declares samplers.
    if constexpr (Family::supportsSampler) {
        if (args.device->getDeviceInfo().imageSupport && !args.makeCommandView) {
            if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
                auto dsHeap = args.dynamicStateHeap;
                if (dsHeap == nullptr) {
                    dsHeap = container.getIndirectHeap(HeapType::dynamicState);
                    auto dshSizeRequired = NEO::EncodeDispatchKernel::getSizeRequiredDsh(kernelDescriptor, container.getNumIddPerBlock());
                    if (dsHeap->getAvailableSpace() <= dshSizeRequired) {
                        dsHeap = container.getHeapWithRequiredSizeAndAlignment(HeapType::dynamicState, dsHeap->getMaxAvailableSpace(), NEO::EncodeDispatchKernel::getDefaultDshAlignment());
                    }
                }
                UNRECOVERABLE_IF(!dsHeap);

                auto bindlessHeapsHelper = args.device->getBindlessHeapsHelper();
                samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
                uint64_t samplerStateOffset = EncodeStates::copySamplerState(
                    dsHeap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
                    kernelDescriptor.payloadMappings.samplerTable.numSamplers, kernelDescriptor.payloadMappings.samplerTable.borderColor,
                    args.dispatchInterface->getDynamicStateHeapData(),
                    bindlessHeapsHelper, rootDeviceEnvironment);

                if (bindlessHeapsHelper && !bindlessHeapsHelper->isGlobalDshSupported()) {
                    // add offset of graphics allocation base address relative to heap base address
                    samplerStateOffset += static_cast(ptrDiff(dsHeap->getGpuBase(), bindlessHeapsHelper->getGlobalHeapsBase()));
                }
                if (heaplessModeEnabled && bindlessHeapsHelper) {
                    samplerStateOffset += bindlessHeapsHelper->getGlobalHeapsBase();
                }

                args.dispatchInterface->patchSamplerBindlessOffsetsInCrossThreadData(samplerStateOffset);
                if constexpr (!heaplessModeEnabled) {
                    idd.setSamplerStatePointer(static_cast(samplerStateOffset));
                }
            }
        }
    }

    if constexpr (heaplessModeEnabled == false) {
        EncodeDispatchKernel::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount);
    }

    uint64_t offsetThreadData = 0u;
    constexpr uint32_t inlineDataSize = WalkerType::getInlineDataSize();
    auto crossThreadData = args.dispatchInterface->getCrossThreadData();

    // Inline data: the leading min(inlineDataSize, sizeCrossThreadData) bytes of
    // cross-thread data go into the walker; the remainder goes to the payload.
    uint32_t inlineDataProgrammingOffset = 0u;
    bool inlineDataProgramming = EncodeDispatchKernel::inlineDataProgrammingRequired(kernelDescriptor);
    if (inlineDataProgramming) {
        inlineDataProgrammingOffset = std::min(inlineDataSize, sizeCrossThreadData);
        auto dest = reinterpret_cast(walkerCmd.getInlineDataPointer());
        memcpy_s(dest, inlineDataSize, crossThreadData, inlineDataProgrammingOffset);
        sizeCrossThreadData -= inlineDataProgrammingOffset;
        crossThreadData = ptrOffset(crossThreadData, inlineDataProgrammingOffset);
        // Nothing was inlined when the kernel has no cross-thread data.
        inlineDataProgramming = inlineDataProgrammingOffset != 0;
    }

    auto scratchAddressForImmediatePatching = EncodeDispatchKernel::getScratchAddressForImmediatePatching(container, args);
    uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
    uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, rootDeviceEnvironment);
    uint32_t sizeForImplicitArgsStruct = NEO::ImplicitArgsHelper::getSizeForImplicitArgsStruct(pImplicitArgs, kernelDescriptor, true, rootDeviceEnvironment);
    uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching + args.reserveExtraPayloadSpace;
    IndirectParamsInInlineDataArgs encodeIndirectParamsArgs{};
    {
        // `ptr` is the CPU write cursor for the payload: space from the indirect
        // object heap, or args.cpuPayloadBuffer for a command view.
        void *ptr = nullptr;
        if (!args.makeCommandView) {
            auto heap = container.getIndirectHeap(HeapType::indirectObject);
            UNRECOVERABLE_IF(!heap);
            heap->align(Family::cacheLineSize);

            if (args.isKernelDispatchedFromImmediateCmdList) {
                ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::indirectObject, iohRequiredSize, Family::indirectDataAlignment)->getSpace(iohRequiredSize);
            } else {
                ptr = container.getHeapSpaceAllowGrow(HeapType::indirectObject, iohRequiredSize);
            }

            // Thread data sits at the end of the just-reserved range, ahead of
            // the reserved extra payload space.
            offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace);

            if (pImplicitArgs) {
                offsetThreadData -= sizeForImplicitArgsStruct;
                pImplicitArgs->setLocalIdTablePtr(heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize);
                EncodeDispatchKernel::patchScratchAddressInImplicitArgs(*pImplicitArgs, scratchAddressForImmediatePatching, args.immediateScratchAddressPatching);

                // patchImplicitArgs returns the cursor used for the copies below.
                ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(!localIdsGenerationByRuntime, requiredWorkgroupOrder), rootDeviceEnvironment, &args.outImplicitArgsPtr);
            }

            if (args.isIndirect) {
                auto gpuPtr = heap->getGraphicsAllocation()->getGpuAddress() + static_cast(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset);
                uint64_t implicitArgsGpuPtr = 0u;
                if (pImplicitArgs) {
                    implicitArgsGpuPtr = gpuPtr + inlineDataProgrammingOffset - sizeForImplicitArgsStruct;
                }
                EncodeIndirectParams::encode(container, gpuPtr, args.dispatchInterface, implicitArgsGpuPtr, &encodeIndirectParamsArgs);
            }
        } else {
            ptr = args.cpuPayloadBuffer;
        }

        if (sizeCrossThreadData > 0) {
            memcpy_s(ptr, sizeCrossThreadData, crossThreadData, sizeCrossThreadData);
        }

        // Per-thread data follows the (remaining) cross-thread data.
        auto perThreadDataPtr = args.dispatchInterface->getPerThreadData();
        if (perThreadDataPtr != nullptr) {
            ptr = ptrOffset(ptr, sizeCrossThreadData);
            memcpy_s(ptr, sizePerThreadDataForWholeGroup, perThreadDataPtr, sizePerThreadDataForWholeGroup);
        }
    }

    // Re-program STATE_BASE_ADDRESS when any heap is dirty or uncached MOCS is
    // requested; a barrier is emitted first.
    if (args.isHeaplessStateInitEnabled == false && !args.makeCommandView) {
        if (container.isAnyHeapDirty() || args.requiresUncachedMocs) {
            PipeControlArgs syncArgs;
            syncArgs.dcFlushEnable = args.postSyncArgs.dcFlushEnable;
            MemorySynchronizationCommands::addSingleBarrier(*container.getCommandStream(), syncArgs);
            STATE_BASE_ADDRESS sbaCmd;
            auto gmmHelper = container.getDevice()->getGmmHelper();
            uint32_t statelessMocsIndex = args.requiresUncachedMocs ? (gmmHelper->getUncachedMOCS() >> 1) : (gmmHelper->getL3EnabledMOCS() >> 1);
            auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false);
            auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true);

            EncodeStateBaseAddressArgs encodeStateBaseAddressArgs = {
                &container,                  // container
                sbaCmd,                      // sbaCmd
                nullptr,                     // sbaProperties
                statelessMocsIndex,          // statelessMocsIndex
                l1CachePolicy,               // l1CachePolicy
                l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
                args.partitionCount > 1,     // multiOsContextCapable
                args.isRcs,                  // isRcs
                container.doubleSbaWaRef(),  // doubleSbaWa
                heaplessModeEnabled          // heaplessModeEnabled
            };
            EncodeStateBaseAddress::encode(encodeStateBaseAddressArgs);
            container.setDirtyStateForAllHeaps(false);

            bool sbaTrackingEnabled = NEO::Debugger::isDebugEnabled(args.isInternal) && args.device->getL0Debugger();
            NEO::EncodeStateBaseAddress::setSbaTrackingForL0DebuggerIfEnabled(sbaTrackingEnabled, *args.device, *container.getCommandStream(), sbaCmd, container.isUsingPrimaryBuffer());
        }
    }

    // Optional debug pause before the workload (PauseOnEnqueue flag).
    if (!args.makeCommandView) {
        if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
            void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
            args.additionalCommands->push_back(commandBuffer);

            EncodeSemaphore::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
        }
    }

    uint8_t *inlineData = reinterpret_cast(walkerCmd.getInlineDataPointer());
    EncodeDispatchKernel::programInlineDataHeapless(inlineData, args, container, offsetThreadData, scratchAddressForImmediatePatching);

    if constexpr (heaplessModeEnabled == false) {
        if (!args.makeCommandView) {
            walkerCmd.setIndirectDataStartAddress(static_cast(offsetThreadData));
            walkerCmd.setIndirectDataLength(sizeThreadData);
        }
    }

    container.getIndirectHeap(HeapType::indirectObject)->align(NEO::EncodeDispatchKernel::getDefaultIOHAlignment());

    EncodeDispatchKernel::encodeThreadData(walkerCmd, nullptr, threadGroupDims, args.dispatchInterface->getGroupSize(), kernelDescriptor.kernelAttributes.simdSize, kernelDescriptor.kernelAttributes.numLocalIdChannels, threadsPerThreadGroup, args.dispatchInterface->getThreadExecutionMask(), localIdsGenerationByRuntime, inlineDataProgramming, args.isIndirect, requiredWorkgroupOrder, rootDeviceEnvironment);

    // Post-sync selection: in-order execution info takes precedence over a
    // regular event address; with neither, the forced-flush helper is called.
    if (args.postSyncArgs.inOrderExecInfo) {
        EncodePostSync::setupPostSyncForInOrderExec(walkerCmd, args.postSyncArgs);
    } else if (args.postSyncArgs.eventAddress) {
        EncodePostSync::setupPostSyncForRegularEvent(walkerCmd, args.postSyncArgs);
    } else {
        EncodeDispatchKernel::forceComputeWalkerPostSyncFlushWithWrite(walkerCmd);
    }

    if (debugManager.flags.ForceComputeWalkerPostSyncFlush.get() == 1) {
        auto &postSync = walkerCmd.getPostSync();
        postSync.setDataportPipelineFlush(true);
        postSync.setDataportSubsliceCacheFlush(true);
    }

    walkerCmd.setPredicateEnable(args.isPredicate);

    auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
    EncodeDispatchKernel::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadDimsVec, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, walkerCmd);
    if (debugManager.flags.PrintKernelDispatchParameters.get()) {
        fprintf(stdout, "kernel, %s, grfCount, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n",
                kernelDescriptor.kernelMetadata.kernelName.c_str(),
                kernelDescriptor.kernelAttributes.numGrfRequired,
                kernelDescriptor.kernelAttributes.simdSize,
                args.device->getNumSubDevices(),
                ImplicitScalingHelper::isImplicitScalingEnabled(args.device->getDeviceBitfield(), true) ? "Yes" : "no",
                threadGroupCount,
                idd.getNumberOfThreadsInGpgpuThreadGroup(),
                walkerCmd.getThreadGroupIdXDimension(),
                walkerCmd.getThreadGroupIdYDimension(),
                walkerCmd.getThreadGroupIdZDimension(),
                idd.getThreadGroupDispatchSize());
    }

    EncodeDispatchKernel::setupPreferredSlmSize(&idd, rootDeviceEnvironment, threadsPerThreadGroup, args.dispatchInterface->getSlmTotalSize(), args.dispatchInterface->getSlmPolicy());

    auto kernelExecutionType = args.isCooperative ? KernelExecutionType::concurrent : KernelExecutionType::defaultType;

    EncodeWalkerArgs walkerArgs{
        .argsExtended = args.extendedArgs,
        .kernelExecutionType = kernelExecutionType,
        .requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
        .localRegionSize = args.localRegionSize,
        .maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
        .requiredSystemFence = args.postSyncArgs.requiresSystemMemoryFence(),
        .hasSample = kernelDescriptor.kernelAttributes.flags.hasSample,
        .l0DebuggerEnabled = args.device->getL0Debugger() != nullptr};

    EncodeDispatchKernel::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs);
    EncodeDispatchKernel::encodeWalkerPostSyncFields(walkerCmd, rootDeviceEnvironment, walkerArgs);
    EncodeDispatchKernel::encodeComputeDispatchAllWalker(walkerCmd, &idd, rootDeviceEnvironment, walkerArgs);

    EncodeDispatchKernel::overrideDefaultValues(walkerCmd, idd);

    uint32_t workgroupSize = args.dispatchInterface->getGroupSize()[0] * args.dispatchInterface->getGroupSize()[1] * args.dispatchInterface->getGroupSize()[2];
    bool isRequiredDispatchWorkGroupOrder = args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none;

    // Multi-partition, non-internal dispatches go through implicit scaling;
    // everything else is emitted as a single-partition walker.
    if (args.partitionCount > 1 && !args.isInternal) {
        const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();

        ImplicitScalingDispatchCommandArgs implicitScalingArgs{
            workPartitionAllocationGpuVa,                                                    // workPartitionAllocationGpuVa
            args.device,                                                                     // device
            &args.outWalkerPtr,                                                              // outWalkerPtr
            args.requiredPartitionDim,                                                       // requiredPartitionDim
            args.partitionCount,                                                             // partitionCount
            workgroupSize,                                                                   // workgroupSize
            threadGroupCount,                                                                // threadGroupCount
            args.maxWgCountPerTile,                                                          // maxWgCountPerTile
            !(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
            !args.isKernelDispatchedFromImmediateCmdList,                                    // apiSelfCleanup
            args.postSyncArgs.dcFlushEnable,                                                 // dcFlush
            EncodeDispatchKernel::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
            args.makeCommandView,                                                            // blockDispatchToCommandBuffer
            isRequiredDispatchWorkGroupOrder};                                               // isRequiredDispatchWorkGroupOrder

        ImplicitScalingDispatch::dispatchCommands(*listCmdBufferStream, walkerCmd, args.device->getDeviceBitfield(), implicitScalingArgs);
        // The dispatcher may have changed the partition count; report it back.
        args.partitionCount = implicitScalingArgs.partitionCount;
    } else {
        args.partitionCount = 1;
        EncodeDispatchKernel::setWalkerRegionSettings(walkerCmd, *args.device, args.partitionCount, workgroupSize, threadGroupCount, args.maxWgCountPerTile, isRequiredDispatchWorkGroupOrder);

        if (!args.makeCommandView) {
            auto buffer = listCmdBufferStream->getSpaceForCmd();
            args.outWalkerPtr = buffer;
            *buffer = walkerCmd;
        }
    }

    // Indirect dispatch: hand the GPU address of the walker's inline data to the
    // indirect-parameter encoder.
    if (args.isIndirect) {
        auto walkerGpuVa = listCmdBufferStream->getGpuBase() + ptrDiff(args.outWalkerPtr, listCmdBufferStream->getCpuBase());
        EncodeIndirectParams::applyInlineDataGpuVA(encodeIndirectParamsArgs, walkerGpuVa + ptrDiff(walkerCmd.getInlineDataPointer(), &walkerCmd));
    }

    if (args.cpuWalkerBuffer) {
        *reinterpret_cast(args.cpuWalkerBuffer) = walkerCmd;
    }

    // Optional debug pause after the workload (PauseOnEnqueue flag).
    if (!args.makeCommandView) {
        if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) {
            void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
args.additionalCommands->push_back(commandBuffer); EncodeSemaphore::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands); } } } template template void EncodePostSync::setupPostSyncForRegularEvent(CommandType &cmd, const EncodePostSyncArgs &args) { using POSTSYNC_DATA = decltype(Family::template getPostSyncType()); auto &postSync = cmd.getPostSync(); auto operationType = POSTSYNC_DATA::OPERATION_WRITE_IMMEDIATE_DATA; uint64_t gpuVa = args.eventAddress; uint64_t immData = args.postSyncImmValue; if (args.isTimestampEvent) { operationType = POSTSYNC_DATA::OPERATION_WRITE_TIMESTAMP; immData = 0; UNRECOVERABLE_IF(!(isAligned(gpuVa))); } else { UNRECOVERABLE_IF(!(isAligned(gpuVa))); } uint32_t mocs = getPostSyncMocs(args.device->getRootDeviceEnvironment(), args.dcFlushEnable); setPostSyncData(postSync, operationType, gpuVa, immData, 0, mocs, false, false); encodeL3Flush(cmd, args); adjustTimestampPacket(cmd, args); } template template void EncodePostSync::setPostSyncDataCommon(PostSyncT &postSyncData, typename PostSyncT::OPERATION operation, uint64_t gpuVa, uint64_t immData) { postSyncData.setOperation(operation); postSyncData.setImmediateData(immData); postSyncData.setDestinationAddress(gpuVa); } template inline uint32_t EncodePostSync::getPostSyncMocs(const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) { auto gmmHelper = rootDeviceEnvironment.getGmmHelper(); if (debugManager.flags.OverridePostSyncMocs.get() != -1) { return debugManager.flags.OverridePostSyncMocs.get(); } if (dcFlush) { return gmmHelper->getUncachedMOCS(); } else { return gmmHelper->getL3EnabledMOCS(); } } template bool EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels, const size_t *lws, std::array walkOrder, bool requireInputWalkOrder, uint32_t &requiredWalkOrder, uint32_t simd) { if (simd == 1) { return true; } bool hwGenerationOfLocalIdsEnabled = true; if (debugManager.flags.EnableHwGenerationLocalIds.get() != -1) { 
hwGenerationOfLocalIdsEnabled = !!debugManager.flags.EnableHwGenerationLocalIds.get(); } if (hwGenerationOfLocalIdsEnabled) { if (activeChannels == 0) { return false; } size_t totalLwsSize = 1u; for (auto dimension = 0u; dimension < activeChannels; dimension++) { totalLwsSize *= lws[dimension]; } if (totalLwsSize > 1024u) { return true; } // check if we need to follow kernel requirements if (requireInputWalkOrder) { for (uint32_t dimension = 0; dimension < activeChannels - 1; dimension++) { if (!Math::isPow2(lws[walkOrder[dimension]])) { return true; } } auto index = 0u; while (index < HwWalkOrderHelper::walkOrderPossibilties) { if (walkOrder[0] == HwWalkOrderHelper::compatibleDimensionOrders[index][0] && walkOrder[1] == HwWalkOrderHelper::compatibleDimensionOrders[index][1]) { break; } index++; } DEBUG_BREAK_IF(index >= HwWalkOrderHelper::walkOrderPossibilties); requiredWalkOrder = index; return false; } // kernel doesn't specify any walk order requirements, check if we have any compatible for (uint32_t walkOrder = 0; walkOrder < HwWalkOrderHelper::walkOrderPossibilties; walkOrder++) { bool allDimensionsCompatible = true; for (uint32_t dimension = 0; dimension < activeChannels - 1; dimension++) { if (!Math::isPow2(lws[HwWalkOrderHelper::compatibleDimensionOrders[walkOrder][dimension]])) { allDimensionsCompatible = false; break; } } if (allDimensionsCompatible) { requiredWalkOrder = walkOrder; return false; } } } return true; } template template void EncodeDispatchKernel::encodeThreadData(WalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment) { if (isIndirect) { walkerCmd.setIndirectParameterEnable(true); } else { 
walkerCmd.setThreadGroupIdXDimension(static_cast(numWorkGroups[0])); walkerCmd.setThreadGroupIdYDimension(static_cast(numWorkGroups[1])); walkerCmd.setThreadGroupIdZDimension(static_cast(numWorkGroups[2])); } if (startWorkGroup) { walkerCmd.setThreadGroupIdStartingX(static_cast(startWorkGroup[0])); walkerCmd.setThreadGroupIdStartingY(static_cast(startWorkGroup[1])); walkerCmd.setThreadGroupIdStartingZ(static_cast(startWorkGroup[2])); } uint64_t executionMask = threadExecutionMask; if (executionMask == 0) { auto workGroupSize = workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]; auto remainderSimdLanes = workGroupSize & (simd - 1); executionMask = maxNBitValue(remainderSimdLanes); if (!executionMask) { executionMask = maxNBitValue(isSimd1(simd) ? 32 : simd); } } walkerCmd.setExecutionMask(static_cast(executionMask)); walkerCmd.setSimdSize(getSimdConfig(simd)); walkerCmd.setMessageSimd(walkerCmd.getSimdSize()); if (debugManager.flags.ForceSimdMessageSizeInWalker.get() != -1) { walkerCmd.setMessageSimd(debugManager.flags.ForceSimdMessageSizeInWalker.get()); } // 1) cross-thread inline data will be put into R1, but if kernel uses local ids, then cross-thread should be put further back // so whenever local ids are driver or hw generated, reserve space by setting right values for emitLocalIds // 2) Auto-generation of local ids should be possible, when in fact local ids are used if (!localIdsGenerationByRuntime && localIdDimensions > 0) { UNRECOVERABLE_IF(localIdDimensions != 3); uint32_t emitLocalIdsForDim = (1 << 0) | (1 << 1) | (1 << 2); walkerCmd.setEmitLocalId(emitLocalIdsForDim); walkerCmd.setLocalXMaximum(static_cast(workGroupSizes[0] - 1)); walkerCmd.setLocalYMaximum(static_cast(workGroupSizes[1] - 1)); walkerCmd.setLocalZMaximum(static_cast(workGroupSizes[2] - 1)); walkerCmd.setGenerateLocalId(1); walkerCmd.setWalkOrder(requiredWorkGroupOrder); } adjustWalkOrder(walkerCmd, requiredWorkGroupOrder, rootDeviceEnvironment); if (inlineDataProgrammingRequired == 
true) { walkerCmd.setEmitInlineParameter(1); } } template inline bool EncodeDispatchKernel::isDshNeeded(const DeviceInfo &deviceInfo) { if constexpr (Family::supportsSampler) { return deviceInfo.imageSupport; } return false; } template void EncodeStateBaseAddress::setSbaAddressesForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) { sbaAddress.bindlessSurfaceStateBaseAddress = sbaCmd.getBindlessSurfaceStateBaseAddress(); sbaAddress.dynamicStateBaseAddress = sbaCmd.getDynamicStateBaseAddress(); sbaAddress.generalStateBaseAddress = sbaCmd.getGeneralStateBaseAddress(); sbaAddress.instructionBaseAddress = sbaCmd.getInstructionBaseAddress(); sbaAddress.surfaceStateBaseAddress = sbaCmd.getSurfaceStateBaseAddress(); sbaAddress.indirectObjectBaseAddress = 0; } template void EncodeStateBaseAddress::encode(EncodeStateBaseAddressArgs &args) { auto &device = *args.container->getDevice(); auto gmmHelper = device.getRootDeviceEnvironment().getGmmHelper(); auto dsh = args.container->isHeapDirty(HeapType::dynamicState) ? args.container->getIndirectHeap(HeapType::dynamicState) : nullptr; auto ioh = args.container->isHeapDirty(HeapType::indirectObject) ? args.container->getIndirectHeap(HeapType::indirectObject) : nullptr; auto ssh = args.container->isHeapDirty(HeapType::surfaceState) ? args.container->getIndirectHeap(HeapType::surfaceState) : nullptr; auto isDebuggerActive = device.getDebugger() != nullptr; bool setGeneralStateBaseAddress = args.sbaProperties ? 
false : true; uint64_t globalHeapsBase = 0; uint64_t bindlessSurfStateBase = 0; bool useGlobalSshAndDsh = false; if (device.getBindlessHeapsHelper()) { bindlessSurfStateBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase(); globalHeapsBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase(); useGlobalSshAndDsh = true; } StateBaseAddressHelperArgs stateBaseAddressHelperArgs = { 0, // generalStateBaseAddress args.container->getIndirectObjectHeapBaseAddress(), // indirectObjectHeapBaseAddress args.container->getInstructionHeapBaseAddress(), // instructionHeapBaseAddress globalHeapsBase, // globalHeapsBaseAddress 0, // surfaceStateBaseAddress bindlessSurfStateBase, // bindlessSurfaceStateBaseAddress &args.sbaCmd, // stateBaseAddressCmd args.sbaProperties, // sbaProperties dsh, // dsh ioh, // ioh ssh, // ssh gmmHelper, // gmmHelper args.statelessMocsIndex, // statelessMocsIndex args.l1CachePolicy, // l1CachePolicy args.l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive NEO::MemoryCompressionState::notApplicable, // memoryCompressionState true, // setInstructionStateBaseAddress setGeneralStateBaseAddress, // setGeneralStateBaseAddress useGlobalSshAndDsh, // useGlobalHeapsBaseAddress args.multiOsContextCapable, // isMultiOsContextCapable false, // areMultipleSubDevicesInContext false, // overrideSurfaceStateBaseAddress isDebuggerActive, // isDebuggerActive args.doubleSbaWa, // doubleSbaWa args.heaplessModeEnabled // heaplessModeEnabled }; StateBaseAddressHelper::programStateBaseAddressIntoCommandStream(stateBaseAddressHelperArgs, *args.container->getCommandStream()); if (args.sbaProperties) { if (args.sbaProperties->bindingTablePoolBaseAddress.value != StreamProperty64::initValue) { StateBaseAddressHelper::programBindingTableBaseAddress(*args.container->getCommandStream(), static_cast(args.sbaProperties->bindingTablePoolBaseAddress.value), static_cast(args.sbaProperties->bindingTablePoolSize.value), gmmHelper); } } else if 
(args.container->isHeapDirty(HeapType::surfaceState) && ssh != nullptr) {
        auto heap = args.container->getIndirectHeap(HeapType::surfaceState);
        StateBaseAddressHelper::programBindingTableBaseAddress(*args.container->getCommandStream(), *heap, gmmHelper);
    }
}

// Returns the number of bytes to reserve for state-base-address programming:
//  - one STATE_BASE_ADDRESS command,
//  - a second STATE_BASE_ADDRESS when the product helper reports that the
//    additional-SBA workaround is required,
//  - one _3DSTATE_BINDING_TABLE_POOL_ALLOC when the surface-state heap is dirty.
// When Family::isHeaplessRequired() is true this path must not be reached and
// UNRECOVERABLE_IF(true) fires. `isRcs` is not used by this implementation.
template size_t EncodeStateBaseAddress::getRequiredSizeForStateBaseAddress(Device &device, CommandContainer &container, bool isRcs) {
    if constexpr (!Family::isHeaplessRequired()) {
        auto &hwInfo = device.getHardwareInfo();
        auto &productHelper = device.getProductHelper();

        size_t size = sizeof(typename Family::STATE_BASE_ADDRESS);
        if (productHelper.isAdditionalStateBaseAddressWARequired(hwInfo)) {
            size += sizeof(typename Family::STATE_BASE_ADDRESS);
        }
        if (container.isHeapDirty(HeapType::surfaceState)) {
            size += sizeof(typename Family::_3DSTATE_BINDING_TABLE_POOL_ALLOC);
        }

        return size;
    } else {
        UNRECOVERABLE_IF(true);
        return 0;
    }
}

// Intentionally empty: this implementation emits nothing for the media
// interface descriptor load.
template inline void EncodeMediaInterfaceDescriptorLoad::encode(CommandContainer &container, IndirectHeap *childDsh) {}

// Applies buffer-specific extras to the surface state found at args.outMemory:
// MOCS selection, extra cache settings and the compression format.
// args.allocation may be null; every use of it below is guarded.
template void EncodeSurfaceState::encodeExtraBufferParams(EncodeSurfaceStateArgs &args) {
    auto surfaceState = reinterpret_cast(args.outMemory);
    Gmm *gmm = args.allocation ? args.allocation->getDefaultGmm() : nullptr;
    uint32_t compressionFormat = 0;

    // The L1-enabled MOCS is selected for constant surfaces, and for surfaces
    // currently using the L3-enabled MOCS unless ForceL1Caching is 0.
    bool setConstCachePolicy = false;
    if (args.allocation && args.allocation->getAllocationType() == AllocationType::constantSurface) {
        setConstCachePolicy = true;
    }

    if (surfaceState->getMemoryObjectControlState() == args.gmmHelper->getL3EnabledMOCS() &&
        debugManager.flags.ForceL1Caching.get() != 0) {
        setConstCachePolicy = true;
    }

    if (setConstCachePolicy == true) {
        surfaceState->setMemoryObjectControlState(args.gmmHelper->getL1EnabledMOCS());
    }

    encodeExtraCacheSettings(surfaceState, args);

    // With aux mode enabled the compression format is derived from the GMM
    // resource format, unless ForceBufferCompressionFormat overrides it.
    if (EncodeSurfaceState::isAuxModeEnabled(surfaceState, gmm)) {
        auto resourceFormat = gmm->gmmResourceInfo->getResourceFormat();
        compressionFormat = args.gmmHelper->getClientContext()->getSurfaceStateCompressionFormat(resourceFormat);

        if (debugManager.flags.ForceBufferCompressionFormat.get() != -1) {
            compressionFormat = debugManager.flags.ForceBufferCompressionFormat.get();
        }
    }

    // Debug path: for allocations outside the system memory pool, force
    // GPU-coherent + CCS aux params and take the format from the debug flag.
    // This runs last, so it overrides the format chosen above.
    if (debugManager.flags.EnableStatelessCompressionWithUnifiedMemory.get()) {
        if (args.allocation && !MemoryPoolHelper::isSystemMemoryPool(args.allocation->getMemoryPool())) {
            setCoherencyType(surfaceState, R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT);
            setBufferAuxParamsForCCS(surfaceState);
            compressionFormat = debugManager.flags.FormatForStatelessCompressionWithUnifiedMemory.get();
        }
    }

    surfaceState->setCompressionFormat(compressionFormat);
}

// Fills *cmd with an MI_SEMAPHORE_WAIT. The command is assembled in a local
// copy initialised from Family::cmdInitMiSemaphoreWait and assigned to *cmd
// in one step at the end.
//   waitMode:         true -> POLLING_MODE,       false -> SIGNAL_MODE
//   registerPollMode: true -> REGISTER_POLL,      false -> MEMORY_POLL
// compareData goes through a static_cast before being set as the semaphore
// data dword; the unmodified 64-bit value is also handed to
// appendSemaphoreCommand together with indirect/useQwordData/switchOnUnsuccessful.
template void EncodeSemaphore::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd,
                                                      uint64_t compareAddress,
                                                      uint64_t compareData,
                                                      COMPARE_OPERATION compareMode,
                                                      bool registerPollMode,
                                                      bool waitMode,
                                                      bool useQwordData,
                                                      bool indirect,
                                                      bool switchOnUnsuccessful) {
    MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait;
    localCmd.setCompareOperation(compareMode);
    localCmd.setSemaphoreDataDword(static_cast(compareData));
    localCmd.setSemaphoreGraphicsAddress(compareAddress);
    localCmd.setWaitMode(waitMode ? MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE : MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_SIGNAL_MODE);
    localCmd.setRegisterPollMode(registerPollMode ? MI_SEMAPHORE_WAIT::REGISTER_POLL_MODE::REGISTER_POLL_MODE_REGISTER_POLL : MI_SEMAPHORE_WAIT::REGISTER_POLL_MODE::REGISTER_POLL_MODE_MEMORY_POLL);
    localCmd.setIndirectSemaphoreDataDword(indirect);

    EncodeSemaphore::appendSemaphoreCommand(localCmd, compareData, indirect, useQwordData, switchOnUnsuccessful);

    *cmd = localCmd;
}

// Intentionally empty: no additional pipeline select is emitted here.
template inline void EncodeWA::encodeAdditionalPipelineSelect(LinearStream &stream, const PipelineSelectArgs &args, bool is3DPipeline,
                                                              const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) {}

// Always 0, matching the empty encodeAdditionalPipelineSelect above.
template inline size_t EncodeWA::getAdditionalPipelineSelectSize(Device &device, bool isRcs) {
    return 0u;
}

// Emits a single barrier into commandStream, first augmenting the pipe-control
// arguments according to the workaround level reported by the product helper.
// `args` is taken by value, so the changes below do not affect the caller.
template inline void EncodeWA::addPipeControlPriorToNonPipelinedStateCommand(LinearStream &commandStream, PipeControlArgs args,
                                                                             const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) {
    auto &productHelper = rootDeviceEnvironment.getHelper();
    auto *releaseHelper = rootDeviceEnvironment.getReleaseHelper();
    auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();

    const auto &[isBasicWARequired, isExtendedWARequired] = productHelper.isPipeControlPriorToNonPipelinedStateCommandsWARequired(hwInfo, isRcs, releaseHelper);

    // The extended workaround takes precedence over the basic one.
    if (isExtendedWARequired) {
        args.textureCacheInvalidationEnable = true;
        args.hdcPipelineFlush = true;
        args.amfsFlushEnable = true;
        args.instructionCacheInvalidateEnable = true;
        args.constantCacheInvalidationEnable = true;
        args.stateCacheInvalidationEnable = true;

        // The extended workaround explicitly clears any requested DC flush.
        args.dcFlushEnable = false;

        NEO::EncodeWA::setAdditionalPipeControlFlagsForNonPipelineStateCommand(args);
    } else if (isBasicWARequired) {
        args.hdcPipelineFlush = true;

        NEO::EncodeWA::setAdditionalPipeControlFlagsForNonPipelineStateCommand(args);
    }

    // The barrier is emitted even when neither workaround applies.
    MemorySynchronizationCommands::addSingleBarrier(commandStream, args);
}

// Fills *cmdBuffer with an MI_STORE_DATA_IMM that writes dataDword0 (and, when
// storeQword is set, dataDword1) to gpuAddress. dataDword1 is ignored for
// dword stores. The command is built locally and assigned in one step.
template inline void EncodeStoreMemory::programStoreDataImm(MI_STORE_DATA_IMM *cmdBuffer,
                                                            uint64_t gpuAddress,
                                                            uint32_t dataDword0,
                                                            uint32_t dataDword1,
                                                            bool storeQword,
                                                            bool workloadPartitionOffset) {
    MI_STORE_DATA_IMM storeDataImmediate = Family::cmdInitStoreDataImm;
    storeDataImmediate.setAddress(gpuAddress);
    storeDataImmediate.setStoreQword(storeQword);
    storeDataImmediate.setDataDword0(dataDword0);
    if (storeQword) {
        storeDataImmediate.setDataDword1(dataDword1);
        storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_QWORD);
    } else {
        storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD);
    }
    storeDataImmediate.setWorkloadPartitionIdOffsetEnable(workloadPartitionOffset);
    EncodeStoreMemory::encodeForceCompletionCheck(storeDataImmediate);
    *cmdBuffer = storeDataImmediate;
}

// Sets the extra MI_STORE_REGISTER_MEM flags: MMIO remap is always enabled and
// the workload-partition offset follows `workloadPartition`.
template inline void EncodeStoreMMIO::appendFlags(MI_STORE_REGISTER_MEM *storeRegMem, bool workloadPartition) {
    storeRegMem->setMmioRemapEnable(true);
    storeRegMem->setWorkloadPartitionIdOffsetEnable(workloadPartition);
}

// No extra dynamic-state-heap space is required here, regardless of iddCount.
template size_t EncodeDispatchKernel::additionalSizeRequiredDsh(uint32_t iddCount) {
    return 0u;
}

// Byte offset of the inline-data member within the family's default walker
// command. `args` is not used by this implementation.
template inline size_t EncodeDispatchKernel::getInlineDataOffset(EncodeDispatchKernelArgs &args) {
    using DefaultWalkerType = typename Family::DefaultWalkerType;
    return offsetof(DefaultWalkerType, TheStructure.Common.InlineData);
}

// Debug hook: when ForceComputeWalkerPostSyncFlushWithWrite is set (!= -1),
// reprograms the walker's post-sync to flush the dataport pipeline and
// subslice cache and to write immediate data 0 to the address given by the
// flag's value.
template
template void EncodeDispatchKernel::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) {
    using POSTSYNC_DATA = decltype(Family::template getPostSyncType());
    using OperationType = typename POSTSYNC_DATA::OPERATION;

    if (debugManager.flags.ForceComputeWalkerPostSyncFlushWithWrite.get() != -1) {
        auto &postSync = walkerCmd.getPostSync();
        postSync.setDataportPipelineFlush(true);
        postSync.setDataportSubsliceCacheFlush(true);
        postSync.setDestinationAddress(static_cast(debugManager.flags.ForceComputeWalkerPostSyncFlushWithWrite.get()));
        postSync.setOperation(OperationType::OPERATION_WRITE_IMMEDIATE_DATA);
        postSync.setImmediateData(0u);
    }
}

// Rounds slmSize up to the smallest entry of the table below that is >= slmSize.
// A size larger than the last entry (128 * kiloByte) hits UNRECOVERABLE_IF.
template uint32_t EncodeDispatchKernel::alignSlmSize(uint32_t slmSize) {
    // Must stay sorted ascending: the loop returns the first entry that fits.
    const uint32_t alignedSlmSizes[] = {
        0u,
        1u * MemoryConstants::kiloByte,
        2u * MemoryConstants::kiloByte,
        4u * MemoryConstants::kiloByte,
        8u * MemoryConstants::kiloByte,
        16u * MemoryConstants::kiloByte,
        24u * MemoryConstants::kiloByte,
        32u * MemoryConstants::kiloByte,
        48u * MemoryConstants::kiloByte,
        64u * MemoryConstants::kiloByte,
        96u * MemoryConstants::kiloByte,
        128u * MemoryConstants::kiloByte,
    };

    for (auto &alignedSlmSize : alignedSlmSizes) {
        if (slmSize <= alignedSlmSize) {
            return alignedSlmSize;
        }
    }

    UNRECOVERABLE_IF(true);
    return 0;
}

// Maps an SLM size in bytes to the SHARED_LOCAL_MEMORY_SIZE encoding of the
// smallest bucket that can hold it (0K, 1K, 2K, 4K, 8K, 16K, 24K, 32K, 48K,
// 64K, 96K, 128K). Sizes above 128 * kiloByte hit UNRECOVERABLE_IF.
// hwInfo, releaseHelper and isHeapless are not used by this implementation.
template uint32_t EncodeDispatchKernel::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize, ReleaseHelper *releaseHelper, bool isHeapless) {
    using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;

    if (slmSize == 0u) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_0K;
    }

    UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte);

    // Checked from the largest bucket down: "size > previous bucket" selects
    // the current bucket.
    if (slmSize > 96u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_128K;
    }
    if (slmSize > 64u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_96K;
    }
    if (slmSize > 48u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_64K;
    }
    if (slmSize > 32u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_48K;
    }
    if (slmSize > 24u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_32K;
    }
    if (slmSize > 16u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_24K;
    }
    if (slmSize > 8u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_16K;
    }
    if (slmSize > 4u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_8K;
    }
    if (slmSize > 2u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_4K;
    }
    if (slmSize > 1u * MemoryConstants::kiloByte) {
        return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_2K;
    }
    return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_1K;
}

// Programs the preferred SLM allocation size in the interface descriptor.
// The SLM size used for the lookup is either the aligned per-work-group size
// (slmPolicyLargeData) or that size multiplied by the number of work groups
// per DSS (slmPolicyLargeSlm and any other policy). The value to program is
// the first entry of the release helper's table whose upperLimit covers that
// size; it stays 0 if no entry matches.
// OverridePreferredSlmAllocationSizePerDss (!= -1) replaces the result.
template
template void EncodeDispatchKernel::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
    using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;
    auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
    const uint32_t threadsPerDssCount = EncodeDispatchKernel::getThreadCountPerSubslice(hwInfo);
    // NOTE(review): threadsPerThreadGroup is used as a divisor here without a
    // local zero check — confirm callers guarantee a non-zero value.
    const uint32_t workGroupCountPerDss = static_cast(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));

    slmTotalSize = EncodeDispatchKernel::alignPreferredSlmSize(slmTotalSize);

    uint32_t slmSize = 0u;

    switch (slmPolicy) {
    case SlmPolicy::slmPolicyLargeData:
        slmSize = slmTotalSize;
        break;
    case SlmPolicy::slmPolicyLargeSlm:
    default:
        slmSize = slmTotalSize * workGroupCountPerDss;
        break;
    }

    constexpr bool isHeapless = Family::template isInterfaceDescriptorHeaplessMode();

    // NOTE(review): releaseHelper is dereferenced without a null check —
    // confirm it is always available for the platforms using this path.
    auto releaseHelper = rootDeviceEnvironment.getReleaseHelper();
    const auto &sizeToPreferredSlmValueArray = releaseHelper->getSizeToPreferredSlmValue(isHeapless);

    uint32_t programmableIdPreferredSlmSize = 0;
    for (auto &range : sizeToPreferredSlmValueArray) {
        if (slmSize <= range.upperLimit) {
            programmableIdPreferredSlmSize = range.valueToProgram;
            break;
        }
    }

    if (debugManager.flags.OverridePreferredSlmAllocationSizePerDss.get() != -1) {
        programmableIdPreferredSlmSize = static_cast(debugManager.flags.OverridePreferredSlmAllocationSizePerDss.get());
    }

    pInterfaceDescriptor->setPreferredSlmAllocationSize(static_cast(programmableIdPreferredSlmSize));
}

// Size of the surface state heap: 2 * MemoryConstants::megaByte.
template size_t EncodeStates::getSshHeapSize() {
    return 2 * MemoryConstants::megaByte;
}

// Applies debug overrides to the walker. ForceL3PrefetchForComputeWalker
// (!= -1) sets L3 prefetch *disable* to the logical negation of the flag.
// interfaceDescriptor is not touched by this implementation.
template
template void EncodeDispatchKernel::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) {
    int32_t forceL3PrefetchForComputeWalker = debugManager.flags.ForceL3PrefetchForComputeWalker.get();
    if (forceL3PrefetchForComputeWalker != -1) {
        walkerCmd.setL3PrefetchDisable(!forceL3PrefetchForComputeWalker);
    }
}

// Selects the thread-group dispatch size (1, 2, 4 or 8) for the interface
// descriptor.
//  - Nothing is selected unless the product helper reports that disabling
//    overdispatch is available; in that case the size starts at TG_SIZE_1.
//  - Algorithm v2 is the default when MaxSubSlicesSupported == SubSliceCount,
//    v1 otherwise; ForceThreadGroupDispatchSizeAlgorithm overrides the choice.
//  - AdjustThreadGroupDispatchSize only gates the v1 path.
//  - ForceThreadGroupDispatchSize (!= -1) is applied last, regardless of the
//    overdispatch check.
// walkerCmd is not used by this implementation.
template
template void EncodeDispatchKernel::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
                                                              const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup,
                                                              WalkerType &walkerCmd) {
    const auto &productHelper = device.getProductHelper();

    if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
        interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);

        bool adjustTGDispatchSize = true;
        if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
            adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
        }
        // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
        auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
        if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
            algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
        }

        // Implicit scaling multiplies the capacity by the sub-device count.
        auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;

        if (algorithmVersion == 2) {
            auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
            // With 256 GRFs the per-core thread budget is halved.
            if (grfCount == 256) {
                threadsPerXeCore /= 2;
            }
            auto tgDispatchSizeSelected = 8;

            // For multi-dimensional dispatches, shrink the size until it
            // divides the X (or, when X == 1, the Y) group count evenly.
            if (threadGroupDimensions[0] > 1 && (threadGroupDimensions[1] > 1 || threadGroupDimensions[2] > 1)) {
                while (threadGroupDimensions[0] % tgDispatchSizeSelected != 0) {
                    tgDispatchSizeSelected /= 2;
                }
            } else if (threadGroupDimensions[1] > 1 && threadGroupDimensions[2] > 1) {
                while (threadGroupDimensions[1] % tgDispatchSizeSelected != 0) {
                    tgDispatchSizeSelected /= 2;
                }
            }

            // make sure we fit all xe core
            while (threadGroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
                tgDispatchSizeSelected /= 2;
            }

            auto threadCountPerGrouping = tgDispatchSizeSelected * threadsPerThreadGroup;
            // make sure we do not use more threads than present on each xe core
            while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
                tgDispatchSizeSelected /= 2;
                threadCountPerGrouping /= 2;
            }

            // Translate the numeric size into the descriptor enum; any value
            // other than 8, 1 or 2 is programmed as TG_SIZE_4.
            if (tgDispatchSizeSelected == 8) {
                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
            } else if (tgDispatchSizeSelected == 1) {
                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
            } else if (tgDispatchSizeSelected == 2) {
                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
            } else {
                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
            }
        } else {
            if (adjustTGDispatchSize) {
                UNRECOVERABLE_IF(grfCount == 0u);
                constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
                constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
                auto &gfxCoreHelper = device.getGfxCoreHelper();
                uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount);
                availableThreadCount *= tileCount;

                uint32_t dispatchedTotalThreadCount = threadsPerThreadGroup * threadGroupCount;
                UNRECOVERABLE_IF(threadsPerThreadGroup == 0u);
                auto tgDispatchSizeSelected = 1u;

                // If the whole dispatch fits on the device, keep size 1;
                // otherwise smaller thread groups get a larger dispatch size.
                if (dispatchedTotalThreadCount <= availableThreadCount) {
                    tgDispatchSizeSelected = 1;
                } else if (threadsPerThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
                    tgDispatchSizeSelected = 8;
                } else if (threadsPerThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
                    tgDispatchSizeSelected = 4;
                } else {
                    tgDispatchSizeSelected = 2;
                }

                // Same divisibility adjustment as in the v2 path.
                if (threadGroupDimensions[0] > 1 && (threadGroupDimensions[1] > 1 || threadGroupDimensions[2] > 1)) {
                    while (threadGroupDimensions[0] % tgDispatchSizeSelected != 0) {
                        tgDispatchSizeSelected /= 2;
                    }
                } else if (threadGroupDimensions[1] > 1 && threadGroupDimensions[2] > 1) {
                    while (threadGroupDimensions[1] % tgDispatchSizeSelected != 0) {
                        tgDispatchSizeSelected /= 2;
                    }
                }

                if (tgDispatchSizeSelected == 8) {
                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
                } else if (tgDispatchSizeSelected == 1) {
                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
                } else if (tgDispatchSizeSelected == 2) {
                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
                } else {
                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
                }
            }
        }
    }

    // Debug override; the flag value is cast straight to the descriptor enum.
    if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
        interfaceDescriptor.setThreadGroupDispatchSize(static_cast(
            debugManager.flags.ForceThreadGroupDispatchSize.get()));
    }
}

// Sets the post-sync "system memory fence request" bit of the walker. The bit
// is set when the product helper requires a global fence in post-sync AND the
// walker args request a system fence;
// ProgramGlobalFenceAsPostSyncOperationInComputeWalker (!= -1) overrides that.
template
template void EncodeDispatchKernel::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {
    auto programGlobalFenceAsPostSyncOperationInComputeWalker = rootDeviceEnvironment.getProductHelper().isGlobalFenceInPostSyncRequired(*rootDeviceEnvironment.getHardwareInfo()) &&
                                                                walkerArgs.requiredSystemFence;
    int32_t overrideProgramSystemMemoryFence = debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get();
    if (overrideProgramSystemMemoryFence != -1) {
        programGlobalFenceAsPostSyncOperationInComputeWalker = !!overrideProgramSystemMemoryFence;
    }
    auto &postSyncData = walkerCmd.getPostSync();
    postSyncData.setSystemMemoryFenceRequest(programGlobalFenceAsPostSyncOperationInComputeWalker);
}

// Programs the L1 cache control policy of the surface state. The default comes
// from the product helper (depending on args.isDebuggerActive).
// OverrideL1CacheControlInSurfaceState (!= -1) replaces it, but only while
// ForceAllResourcesUncached is not set.
template void EncodeSurfaceState::encodeExtraCacheSettings(R_SURFACE_STATE *surfaceState, const EncodeSurfaceStateArgs &args) {
    using L1_CACHE_CONTROL = typename R_SURFACE_STATE::L1_CACHE_CONTROL;
    auto &productHelper = args.gmmHelper->getRootDeviceEnvironment().getHelper();

    auto cachePolicy = static_cast(productHelper.getL1CachePolicy(args.isDebuggerActive));
    if (debugManager.flags.OverrideL1CacheControlInSurfaceState.get() != -1 &&
        debugManager.flags.ForceAllResourcesUncached.get() == false) {
        cachePolicy = static_cast(debugManager.flags.OverrideL1CacheControlInSurfaceState.get());
    }
    surfaceState->setL1CacheControlCachePolicy(cachePolicy);
}

// Writes a 3DSTATE_BTD command into commandStream. The command starts from
// Family::cmd3dStateBtd, gets the per-DSS memory-backed buffer size from
// RayTracingHelper and the base pointer from backBuffer, and is passed through
// append3dStateBtd before being copied into space obtained from the stream.
template void EncodeEnableRayTracing::programEnableRayTracing(LinearStream &commandStream, uint64_t backBuffer) {
    auto cmd = Family::cmd3dStateBtd;
    cmd.setPerDssMemoryBackedBufferSize(static_cast(RayTracingHelper::getMemoryBackedFifoSizeToPatch()));
    cmd.setMemoryBackedBufferBasePointer(backBuffer);
    append3dStateBtd(&cmd);
    *commandStream.getSpaceForCmd() = cmd;
}

// Extra pipe-control flag used by both workaround levels in
// addPipeControlPriorToNonPipelinedStateCommand: requests an untyped
// data-port cache flush.
template inline void EncodeWA::setAdditionalPipeControlFlagsForNonPipelineStateCommand(PipeControlArgs &args) {
    args.unTypedDataPortCacheFlush = true;
}

} // namespace NEO