/*
 * Copyright (C) 2020-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/command_encoder.inl"
#include "shared/source/command_container/command_encoder_from_gen12lp_to_xe2_hpg.inl"
#include "shared/source/command_container/command_encoder_gen12lp_and_xe_hpg.inl"
#include "shared/source/command_container/command_encoder_pre_xe2_hpg_core.inl"
#include "shared/source/command_container/command_encoder_tgllp_and_later.inl"
#include "shared/source/command_container/encode_surface_state.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/memory_compression_state.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/command_stream/stream_properties.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/gen12lp/hw_cmds_base.h"
#include "shared/source/gen12lp/reg_configs.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/cache_policy.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/in_order_cmd_helpers.h"
#include "shared/source/helpers/pause_on_gpu_properties.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/helpers/pipeline_select_args.h"
#include "shared/source/helpers/preamble.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/source/helpers/state_base_address.h"
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
#include "shared/source/kernel/implicit_args_helper.h"
#include "shared/source/release_helper/release_helper.h"

#include "encode_surface_state_args.h"

#include <algorithm>

using Family = NEO::Gen12LpFamily;

#include "shared/source/command_container/command_encoder_heap_addressing.inl"
#include "shared/source/command_stream/command_stream_receiver.h"

namespace NEO {

template <typename Family>
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
                                              const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
                                              const RootDeviceEnvironment &rootDeviceEnvironment) {
    auto grfSize = sizeof(typename Family::GRF);

    DEBUG_BREAK_IF((sizeCrossThreadData % grfSize) != 0);
    auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / grfSize);
    DEBUG_BREAK_IF(numGrfCrossThreadData == 0);
    pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData);

    DEBUG_BREAK_IF((sizePerThreadData % grfSize) != 0);
    auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / grfSize);

    // at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0
    numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
    pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
}
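// Worked example (illustrative, not part of the runtime): Gen12LP GRF registers are
// 32 bytes, so 192 bytes of cross-thread data encode as a read length of 6 GRFs and
// 64 bytes of per-thread data as 2 GRFs per thread; the std::max above guarantees a
// minimum per-thread read length of 1 even when the kernel carries no per-thread payload.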
template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDispatchKernelArgs &args) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;

    auto &kernelDescriptor = args.dispatchInterface->getKernelDescriptor();
    auto sizeCrossThreadData = args.dispatchInterface->getCrossThreadDataSize();
    auto sizePerThreadData = args.dispatchInterface->getPerThreadDataSize();
    auto sizePerThreadDataForWholeGroup = args.dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
    auto pImplicitArgs = args.dispatchInterface->getImplicitArgs();
    auto &hwInfo = args.device->getHardwareInfo();
    auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();

    LinearStream *listCmdBufferStream = container.getCommandStream();

    auto threadGroupDims = static_cast<const uint32_t *>(args.threadGroupDimensions);

    DefaultWalkerType cmd = Family::cmdInitGpgpuWalker;
    auto idd = Family::cmdInitInterfaceDescriptorData;

    {
        auto alloc = args.dispatchInterface->getIsaAllocation();
        UNRECOVERABLE_IF(nullptr == alloc);
        auto offset = alloc->getGpuAddressToPatch() + args.dispatchInterface->getIsaOffsetInParentAllocation();
        idd.setKernelStartPointer(offset);
        idd.setKernelStartPointerHigh(0u);
    }

    if (args.dispatchInterface->getKernelDescriptor().kernelAttributes.flags.usesAssert && args.device->getL0Debugger() != nullptr) {
        idd.setSoftwareExceptionEnable(1);
    }

    auto numThreadsPerThreadGroup = args.dispatchInterface->getNumThreadsPerThreadGroup();
    idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup);

    EncodeDispatchKernel<Family>::programBarrierEnable(idd, kernelDescriptor, hwInfo);

    auto slmSize = EncodeDispatchKernel<Family>::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize(), nullptr, false);
    idd.setSharedLocalMemorySize(slmSize);

    uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
    uint32_t bindingTablePointer = 0u;
    bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor);
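    // Two addressing modes are handled below: bindful kernels get a binding table plus
    // surface states pushed onto the SSH, while bindless kernels get their surface-state
    // payload copied verbatim and the cross-thread data patched with each state's offset
    // from the heap base (including the global-heap delta when a BindlessHeapsHelper is used).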
    if (!isBindlessKernel) {
        container.prepareBindfulSsh();
        if (bindingTableStateCount > 0u) {
            auto ssh = args.surfaceStateHeap;
            if (ssh == nullptr) {
                ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), NEO::EncodeDispatchKernel<Family>::getDefaultSshAlignment());
            }
            bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
                *ssh,
                args.dispatchInterface->getSurfaceStateHeapData(),
                args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
                kernelDescriptor.payloadMappings.bindingTable.tableOffset));
        }
    } else {
        bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr;
        auto sshHeapSize = args.dispatchInterface->getSurfaceStateHeapDataSize();
        if (sshHeapSize > 0u) {
            auto ssh = args.surfaceStateHeap;
            if (ssh == nullptr) {
                container.prepareBindfulSsh();
                ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, sshHeapSize, NEO::EncodeDispatchKernel<Family>::getDefaultSshAlignment());
            }
            uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase());
            if (globalBindlessSsh) {
                bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress());
            }
            DEBUG_BREAK_IF(bindingTableStateCount > 0u);

            // Allocate space for new ssh data
            auto dstSurfaceState = ssh->getSpace(sshHeapSize);
            memcpy_s(dstSurfaceState, sshHeapSize, args.dispatchInterface->getSurfaceStateHeapData(), sshHeapSize);
            args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset);
        }
    }
    idd.setBindingTablePointer(bindingTablePointer);

    PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, args.preemptionMode);

    uint32_t samplerStateOffset = 0;
    uint32_t samplerCount = 0;

    if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
        auto dsHeap = args.dynamicStateHeap;
        if (dsHeap == nullptr) {
            dsHeap = container.getIndirectHeap(HeapType::dynamicState);
            auto dshSizeRequired = NEO::EncodeDispatchKernel<Family>::getSizeRequiredDsh(kernelDescriptor, container.getNumIddPerBlock());
            if (dsHeap->getAvailableSpace() <= dshSizeRequired) {
                dsHeap = container.getHeapWithRequiredSizeAndAlignment(HeapType::dynamicState, dsHeap->getMaxAvailableSpace(), NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
            }
        }
        UNRECOVERABLE_IF(!dsHeap);
        samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
        samplerStateOffset = EncodeStates<Family>::copySamplerState(dsHeap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
                                                                    kernelDescriptor.payloadMappings.samplerTable.numSamplers,
                                                                    kernelDescriptor.payloadMappings.samplerTable.borderColor,
                                                                    args.dispatchInterface->getDynamicStateHeapData(),
                                                                    args.device->getBindlessHeapsHelper(), args.device->getRootDeviceEnvironment());
    }

    idd.setSamplerStatePointer(samplerStateOffset);

    if (!isBindlessKernel) {
        EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount);
    }

    EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData, sizePerThreadData, rootDeviceEnvironment);

    uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
    bool isHwLocalIdGeneration = false;
    uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment);
    uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
    uint64_t offsetThreadData = 0u;
    {
        auto heapIndirect = container.getIndirectHeap(HeapType::indirectObject);
        UNRECOVERABLE_IF(!(heapIndirect));
        heapIndirect->align(Family::cacheLineSize);

        void *ptr = nullptr;
        if (args.isKernelDispatchedFromImmediateCmdList) {
            ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::indirectObject, iohRequiredSize, DefaultWalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)->getSpace(iohRequiredSize);
        } else {
            ptr = container.getHeapSpaceAllowGrow(HeapType::indirectObject, iohRequiredSize);
        }
        UNRECOVERABLE_IF(!(ptr));
        offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast<uint64_t>(heapIndirect->getUsed() - sizeThreadData);

        uint64_t implicitArgsGpuVA = 0u;
        if (pImplicitArgs) {
            implicitArgsGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heapIndirect->getUsed() - iohRequiredSize);
            auto implicitArgsCrossThreadPtr = ptrOffset(const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
            *implicitArgsCrossThreadPtr = implicitArgsGpuVA;

            ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, rootDeviceEnvironment, nullptr);
        }

        memcpy_s(ptr, sizeCrossThreadData,
                 args.dispatchInterface->getCrossThreadData(), sizeCrossThreadData);

        if (args.isIndirect) {
            auto crossThreadDataGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData;
            EncodeIndirectParams<Family>::encode(container, crossThreadDataGpuVA, args.dispatchInterface, implicitArgsGpuVA);
        }

        ptr = ptrOffset(ptr, sizeCrossThreadData);
        memcpy_s(ptr, sizePerThreadDataForWholeGroup,
                 args.dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup);
    }
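    // Resulting indirect-object heap layout for this dispatch (implicit args, when present,
    // are staged in front of the payload; offsetThreadData points at the cross-thread section):
    //   [ implicit args | cross-thread data | per-thread data for the whole thread group ]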
    uint32_t numIDD = 0u;
    void *iddPtr = getInterfaceDescriptor(container, args.dynamicStateHeap, numIDD);

    auto slmSizeNew = args.dispatchInterface->getSlmTotalSize();
    bool dirtyHeaps = container.isAnyHeapDirty();
    bool flush = container.slmSizeRef() != slmSizeNew || dirtyHeaps || args.requiresUncachedMocs;

    if (flush) {
        PipeControlArgs syncArgs;
        syncArgs.dcFlushEnable = args.dcFlushEnable;
        if (dirtyHeaps) {
            syncArgs.hdcPipelineFlush = true;
        }
        MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);

        if (dirtyHeaps || args.requiresUncachedMocs) {
            STATE_BASE_ADDRESS sba;
            auto gmmHelper = container.getDevice()->getGmmHelper();
            uint32_t statelessMocsIndex = args.requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
            auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false);
            auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true);
            EncodeStateBaseAddressArgs<Family> encodeStateBaseAddressArgs = {
                &container,                  // container
                sba,                         // sbaCmd
                nullptr,                     // sbaProperties
                statelessMocsIndex,          // statelessMocsIndex
                l1CachePolicy,               // l1CachePolicy
                l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
                false,                       // multiOsContextCapable
                args.isRcs,                  // isRcs
                container.doubleSbaWaRef(),  // doubleSbaWa
                false,                       // heaplessModeEnabled
            };
            EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
            container.setDirtyStateForAllHeaps(false);
            args.requiresUncachedMocs = false;
        }

        if (container.slmSizeRef() != slmSizeNew) {
            EncodeL3State<Family>::encode(container, slmSizeNew != 0u);
            container.slmSizeRef() = slmSizeNew;
        }
    }

    if (numIDD == 0 || flush) {
        EncodeMediaInterfaceDescriptorLoad<Family>::encode(container, args.dynamicStateHeap);
    }
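    // The walker consumes the payload staged above: INDIRECT_DATA_START_ADDRESS is a
    // heap-relative offset (hence the uint32_t cast), and the interface descriptor is
    // referenced by its index within the IDD block loaded via MEDIA_INTERFACE_DESCRIPTOR_LOAD.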
    cmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
    cmd.setIndirectDataLength(sizeThreadData);
    cmd.setInterfaceDescriptorOffset(numIDD);

    EncodeDispatchKernel<Family>::encodeThreadData(cmd,
                                                   nullptr,
                                                   threadGroupDims,
                                                   args.dispatchInterface->getGroupSize(),
                                                   kernelDescriptor.kernelAttributes.simdSize,
                                                   kernelDescriptor.kernelAttributes.numLocalIdChannels,
                                                   numThreadsPerThreadGroup,
                                                   args.dispatchInterface->getThreadExecutionMask(),
                                                   true,
                                                   false,
                                                   args.isIndirect,
                                                   args.dispatchInterface->getRequiredWorkgroupOrder(),
                                                   rootDeviceEnvironment);

    cmd.setPredicateEnable(args.isPredicate);

    auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
    EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd);

    EncodeWalkerArgs walkerArgs{
        .kernelExecutionType = KernelExecutionType::defaultType,
        .requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
        .localRegionSize = args.localRegionSize,
        .maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
        .requiredSystemFence = args.requiresSystemMemoryFence(),
        .hasSample = false};
    EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, cmd, walkerArgs);
    EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(cmd, walkerArgs);
    EncodeDispatchKernel<Family>::template encodeComputeDispatchAllWalker<DefaultWalkerType, typename Family::INTERFACE_DESCRIPTOR_DATA>(cmd, nullptr, rootDeviceEnvironment, walkerArgs);

    memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));

    if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
        void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false));
        args.additionalCommands->push_back(commandBuffer);

        EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
    }

    auto buffer = listCmdBufferStream->getSpaceForCmd<DefaultWalkerType>();
    *buffer = cmd;

    {
        auto mediaStateFlush = listCmdBufferStream->getSpaceForCmd<MEDIA_STATE_FLUSH>();
        *mediaStateFlush = Family::cmdInitMediaStateFlush;
    }

    args.partitionCount = 1;

    if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) {
        void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false));
        args.additionalCommands->push_back(commandBuffer);

        EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
    }
}
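// Gen12LP dispatches compute through the media pipeline: interface descriptors live in the
// dynamic state heap and are handed to hardware with a MEDIA_STATE_FLUSH followed by
// MEDIA_INTERFACE_DESCRIPTOR_LOAD, as encoded below.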
template <typename Family>
void EncodeMediaInterfaceDescriptorLoad<Family>::encode(CommandContainer &container, IndirectHeap *childDsh) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;

    void *heapBase = nullptr;
    if (childDsh != nullptr) {
        heapBase = childDsh->getCpuBase();
    } else {
        heapBase = container.getIndirectHeap(HeapType::dynamicState)->getCpuBase();
    }

    auto mediaStateFlush = container.getCommandStream()->getSpaceForCmd<MEDIA_STATE_FLUSH>();
    *mediaStateFlush = Family::cmdInitMediaStateFlush;

    auto iddOffset = static_cast<uint32_t>(ptrDiff(container.getIddBlock(), heapBase));

    MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = Family::cmdInitMediaInterfaceDescriptorLoad;
    cmd.setInterfaceDescriptorDataStartAddress(iddOffset);
    cmd.setInterfaceDescriptorTotalLength(sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock());

    auto buffer = container.getCommandStream()->getSpace(sizeof(cmd));
    *(decltype(cmd) *)buffer = cmd;
}

template <typename Family>
inline bool EncodeDispatchKernel<Family>::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
                                                                              const size_t *lws,
                                                                              std::array<uint8_t, 3> walkOrder,
                                                                              bool requireInputWalkOrder,
                                                                              uint32_t &requiredWalkOrder,
                                                                              uint32_t simd) {
    requiredWalkOrder = 0u;
    return true;
}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeThreadData(WalkerType &walkerCmd,
                                                    const uint32_t *startWorkGroup,
                                                    const uint32_t *numWorkGroups,
                                                    const uint32_t *workGroupSizes,
                                                    uint32_t simd,
                                                    uint32_t localIdDimensions,
                                                    uint32_t threadsPerThreadGroup,
                                                    uint32_t threadExecutionMask,
                                                    bool localIdsGenerationByRuntime,
                                                    bool inlineDataProgrammingRequired,
                                                    bool isIndirect,
                                                    uint32_t requiredWorkGroupOrder,
                                                    const RootDeviceEnvironment &rootDeviceEnvironment) {
    if (isIndirect) {
        walkerCmd.setIndirectParameterEnable(true);
    } else {
        walkerCmd.setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
        walkerCmd.setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
        walkerCmd.setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
    }

    if (startWorkGroup) {
        walkerCmd.setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroup[0]));
        walkerCmd.setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroup[1]));
        walkerCmd.setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroup[2]));
    }

    walkerCmd.setSimdSize(getSimdConfig<WalkerType>(simd));

    auto localWorkSize = static_cast<uint32_t>(workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]);
    if (threadsPerThreadGroup == 0) {
        threadsPerThreadGroup = getThreadsPerWG(simd, localWorkSize);
    }
    walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);

    uint64_t executionMask = threadExecutionMask;
    if (executionMask == 0) {
        auto remainderSimdLanes = localWorkSize & (simd - 1);
        executionMask = maxNBitValue(remainderSimdLanes);
        if (!executionMask)
            executionMask = ~executionMask;
    }

    constexpr uint32_t maxDword = std::numeric_limits<uint32_t>::max();
    walkerCmd.setRightExecutionMask(static_cast<uint32_t>(executionMask));
    walkerCmd.setBottomExecutionMask(maxDword);
}
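// Execution-mask arithmetic above, worked through (illustrative values): for SIMD16 with a
// local work size of 20, the remainder is 20 & 15 = 4 lanes, so the right execution mask is
// maxNBitValue(4) = 0xF; when the local size is a multiple of the SIMD width the remainder
// is 0 and the mask flips to all ones (~0).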
template <typename Family>
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, const KernelDescriptor &kernelDescriptor, const HardwareInfo &hwInfo) {
    interfaceDescriptor.setBarrierEnable(kernelDescriptor.kernelAttributes.barrierCount);
}

template <typename Family>
template <typename WalkerType>
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}

template <typename Family>
template <typename WalkerType>
inline void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}

template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>
inline void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {}

template <typename Family>
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}

template <typename Family>
inline bool EncodeDispatchKernel<Family>::isDshNeeded(const DeviceInfo &deviceInfo) {
    return true;
}

template <typename Family>
void EncodeStateBaseAddress<Family>::setSbaAddressesForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) {
    sbaAddress.indirectObjectBaseAddress = sbaCmd.getIndirectObjectBaseAddress();
    sbaAddress.bindlessSurfaceStateBaseAddress = sbaCmd.getBindlessSurfaceStateBaseAddress();
    sbaAddress.dynamicStateBaseAddress = sbaCmd.getDynamicStateBaseAddress();
    sbaAddress.generalStateBaseAddress = sbaCmd.getGeneralStateBaseAddress();
    sbaAddress.instructionBaseAddress = sbaCmd.getInstructionBaseAddress();
    sbaAddress.surfaceStateBaseAddress = sbaCmd.getSurfaceStateBaseAddress();
}

template <typename Family>
void EncodeStateBaseAddress<Family>::encode(EncodeStateBaseAddressArgs<Family> &args) {
    auto &device = *args.container->getDevice();

    if (args.container->isAnyHeapDirty()) {
        EncodeWA<Family>::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, true, device.getRootDeviceEnvironment(), args.isRcs);
    }

    auto gmmHelper = device.getGmmHelper();

    auto dsh = args.container->isHeapDirty(HeapType::dynamicState) ? args.container->getIndirectHeap(HeapType::dynamicState) : nullptr;
    auto ioh = args.container->isHeapDirty(HeapType::indirectObject) ? args.container->getIndirectHeap(HeapType::indirectObject) : nullptr;
    auto ssh = args.container->isHeapDirty(HeapType::surfaceState) ? args.container->getIndirectHeap(HeapType::surfaceState) : nullptr;
    auto isDebuggerActive = device.getDebugger() != nullptr;

    uint64_t globalHeapsBase = 0;
    uint64_t bindlessSurfStateBase = 0;
    bool useGlobalSshAndDsh = false;

    if (device.getBindlessHeapsHelper()) {
        bindlessSurfStateBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase();
        globalHeapsBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase();
        useGlobalSshAndDsh = true;
    }

    StateBaseAddressHelperArgs<Family> stateBaseAddressHelperArgs = {
        0,                                                  // generalStateBaseAddress
        args.container->getIndirectObjectHeapBaseAddress(), // indirectObjectHeapBaseAddress
        args.container->getInstructionHeapBaseAddress(),    // instructionHeapBaseAddress
        globalHeapsBase,                                    // globalHeapsBaseAddress
        0,                                                  // surfaceStateBaseAddress
        bindlessSurfStateBase,                              // bindlessSurfaceStateBaseAddress
        &args.sbaCmd,                                       // stateBaseAddressCmd
        args.sbaProperties,                                 // sbaProperties
        dsh,                                                // dsh
        ioh,                                                // ioh
        ssh,                                                // ssh
        gmmHelper,                                          // gmmHelper
        args.statelessMocsIndex,                            // statelessMocsIndex
        args.l1CachePolicy,                                 // l1CachePolicy
        args.l1CachePolicyDebuggerActive,                   // l1CachePolicyDebuggerActive
        NEO::MemoryCompressionState::notApplicable,         // memoryCompressionState
        false,                                              // setInstructionStateBaseAddress
        false,                                              // setGeneralStateBaseAddress
        useGlobalSshAndDsh,                                 // useGlobalHeapsBaseAddress
        false,                                              // isMultiOsContextCapable
        false,                                              // areMultipleSubDevicesInContext
        false,                                              // overrideSurfaceStateBaseAddress
        isDebuggerActive,                                   // isDebuggerActive
        args.doubleSbaWa,                                   // doubleSbaWa
        args.heaplessModeEnabled                            // heaplessModeEnabled
    };

    StateBaseAddressHelper<Family>::programStateBaseAddressIntoCommandStream(stateBaseAddressHelperArgs, *args.container->getCommandStream());

    EncodeWA<Family>::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, false, device.getRootDeviceEnvironment(), args.isRcs);
}
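// getRequiredSizeForStateBaseAddress below mirrors encode() above: one STATE_BASE_ADDRESS
// command plus room for the 3D-pipeline-select workaround emitted on each side of it.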
template <typename Family>
size_t EncodeStateBaseAddress<Family>::getRequiredSizeForStateBaseAddress(Device &device, CommandContainer &container, bool isRcs) {
    return sizeof(typename Family::STATE_BASE_ADDRESS) + 2 * EncodeWA<Family>::getAdditionalPipelineSelectSize(device, isRcs);
}

template <typename Family>
void EncodeMiFlushDW<Family>::adjust(MI_FLUSH_DW *miFlushDwCmd, const ProductHelper &productHelper) {}

template <typename Family>
inline void EncodeWA<Family>::addPipeControlPriorToNonPipelinedStateCommand(LinearStream &commandStream, PipeControlArgs args,
                                                                            const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) {
    MemorySynchronizationCommands<Family>::addSingleBarrier(commandStream, args);
}

template <typename Family>
inline void EncodeWA<Family>::adjustCompressionFormatForPlanarImage(uint32_t &compressionFormat, int plane) {
}

template <typename Family>
void EncodeSurfaceState<Family>::setCoherencyType(R_SURFACE_STATE *surfaceState, COHERENCY_TYPE coherencyType) {
    surfaceState->setCoherencyType(coherencyType);
}

template <typename Family>
void EncodeSemaphore<Family>::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd,
                                                     uint64_t compareAddress,
                                                     uint64_t compareData,
                                                     COMPARE_OPERATION compareMode,
                                                     bool registerPollMode,
                                                     bool waitMode,
                                                     bool useQwordData,
                                                     bool indirect,
                                                     bool switchOnUnsuccessful) {
    constexpr uint64_t upper32b = static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) << 32;
    UNRECOVERABLE_IF(useQwordData || (compareData & upper32b));
    UNRECOVERABLE_IF(indirect);

    MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait;
    localCmd.setCompareOperation(compareMode);
    localCmd.setSemaphoreDataDword(static_cast<uint32_t>(compareData));
    localCmd.setSemaphoreGraphicsAddress(compareAddress);
    localCmd.setWaitMode(waitMode ? MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE : MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_SIGNAL_MODE);
    *cmd = localCmd;
}

template <typename Family>
void EncodeEnableRayTracing<Family>::programEnableRayTracing(LinearStream &commandStream, uint64_t backBuffer) {
}

template <typename Family>
inline void EncodeStoreMemory<Family>::programStoreDataImm(MI_STORE_DATA_IMM *cmdBuffer,
                                                           uint64_t gpuAddress,
                                                           uint32_t dataDword0,
                                                           uint32_t dataDword1,
                                                           bool storeQword,
                                                           bool workloadPartitionOffset) {
    MI_STORE_DATA_IMM storeDataImmediate = Family::cmdInitStoreDataImm;
    storeDataImmediate.setAddress(gpuAddress);
    storeDataImmediate.setStoreQword(storeQword);
    storeDataImmediate.setDataDword0(dataDword0);
    if (storeQword) {
        storeDataImmediate.setDataDword1(dataDword1);
        storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_QWORD);
    } else {
        storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD);
    }
    EncodeStoreMemory<Family>::encodeForceCompletionCheck(storeDataImmediate);
    *cmdBuffer = storeDataImmediate;
}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::setupPostSyncForRegularEvent(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeL3FlushAfterPostSync(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::setupPostSyncForInOrderExec(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::adjustWalkOrder(WalkerType &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment) {}

template <typename Family>
size_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh(uint32_t iddCount) {
    return iddCount * sizeof(typename Family::INTERFACE_DESCRIPTOR_DATA);
}

template <typename Family>
inline size_t EncodeDispatchKernel<Family>::getInlineDataOffset(EncodeDispatchKernelArgs &args) {
    return 0;
}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) {
}

template <typename Family>
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
    if (slmSize == 0u) {
        return 0u;
    }
    slmSize = std::max(slmSize, 1024u);
    slmSize = Math::nextPowerOfTwo(slmSize);
    UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte);
    return slmSize;
}

template <typename Family>
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize, ReleaseHelper *releaseHelper, bool isHeapless) {
    auto value = std::max(slmSize, 1024u);
    value = Math::nextPowerOfTwo(value);
    value = Math::getMinLsbSet(value);
    value = value - 9;
    DEBUG_BREAK_IF(value > 7);
    return value * !!slmSize;
}
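// SLM encoding worked through (illustrative values): a 4 KB request rounds to
// nextPowerOfTwo(4096) = 4096, getMinLsbSet yields 12, and 12 - 9 = 3, the Gen12LP
// interface-descriptor encoding for 4 KB; the valid range is 1 (1 KB) through 7 (64 KB),
// and a request of 0 encodes as 0.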
template <typename Family>
bool EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) {
    return cooperativeKernel;
}

template <typename Family>
size_t EncodeStates<Family>::getSshHeapSize() {
    return 64 * MemoryConstants::kiloByte;
}

template <typename Family>
void InOrderPatchCommandHelpers::PatchCmd<Family>::patchComputeWalker(uint64_t appendCounterValue) {
    UNRECOVERABLE_IF(true);
}

template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) {
}

template <typename Family>
template <typename InterfaceDescriptorType, typename WalkerType>
void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
                                                             const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd) {
}

template <typename Family>
size_t EncodeDispatchKernel<Family>::getScratchPtrOffsetOfImplicitArgs() {
    return 0;
}

template <typename Family>
void EncodeSurfaceState<Family>::setPitchForScratch(R_SURFACE_STATE *surfaceState, uint32_t pitch, const ProductHelper &productHelper) {
    surfaceState->setSurfacePitch(pitch);
}

template <typename Family>
uint32_t EncodeSurfaceState<Family>::getPitchForScratchInBytes(R_SURFACE_STATE *surfaceState, const ProductHelper &productHelper) {
    return surfaceState->getSurfacePitch();
}

template <typename Family>
void EncodeSemaphore<Family>::appendSemaphoreCommand(MI_SEMAPHORE_WAIT &cmd, uint64_t compareData, bool indirect, bool useQwordData, bool switchOnUnsuccessful) {
    constexpr uint64_t upper32b = static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) << 32;
    UNRECOVERABLE_IF(useQwordData || (compareData & upper32b));
}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &submissionCsr) {
}

template <typename Family>
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeEuSchedulingPolicy(InterfaceDescriptorType *pInterfaceDescriptor, const KernelDescriptor &kernelDesc, int32_t defaultPipelinedThreadArbitrationPolicy) {
}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::setWalkerRegionSettings(WalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount, uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder) {}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}

template <>
size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device, bool isRcs) {
    size_t size = 0;
    const auto &productHelper = device.getProductHelper();
    if (isRcs && productHelper.is3DPipelineSelectWARequired()) {
        size += 2 * PreambleHelper<Family>::getCmdSizeForPipelineSelect(device.getRootDeviceEnvironment());
    }
    return size;
}

template <>
void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const RootDeviceEnvironment &rootDeviceEnvironment) {
    using STATE_COMPUTE_MODE = typename Family::STATE_COMPUTE_MODE;
    using FORCE_NON_COHERENT = typename STATE_COMPUTE_MODE::FORCE_NON_COHERENT;

    STATE_COMPUTE_MODE stateComputeMode = Family::cmdInitStateComputeMode;
    auto maskBits = stateComputeMode.getMaskBits();

    FORCE_NON_COHERENT coherencyValue = (properties.isCoherencyRequired.value == 1) ? FORCE_NON_COHERENT::FORCE_NON_COHERENT_FORCE_DISABLED : FORCE_NON_COHERENT::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT;
    stateComputeMode.setForceNonCoherent(coherencyValue);
    maskBits |= Family::stateComputeModeForceNonCoherentMask;

    stateComputeMode.setMaskBits(maskBits);

    auto buffer = csr.getSpace(sizeof(STATE_COMPUTE_MODE));
    *reinterpret_cast<STATE_COMPUTE_MODE *>(buffer) = stateComputeMode;
}
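// Note on the mask handling above: STATE_COMPUTE_MODE fields only take effect when their
// corresponding mask bit is set, so FORCE_NON_COHERENT is programmed together with
// stateComputeModeForceNonCoherentMask; coherency is requested by force-disabling the
// non-coherent override rather than by a dedicated "coherent" value.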
template <>
void EncodeWA<Family>::encodeAdditionalPipelineSelect(LinearStream &stream, const PipelineSelectArgs &args, bool is3DPipeline,
                                                      const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) {
    const auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
    if (productHelper.is3DPipelineSelectWARequired() && isRcs) {
        PipelineSelectArgs pipelineSelectArgs = args;
        pipelineSelectArgs.is3DPipelineRequired = is3DPipeline;
        PreambleHelper<Family>::programPipelineSelect(&stream, pipelineSelectArgs, rootDeviceEnvironment);
    }
}

template <>
void EncodeSurfaceState<Family>::encodeExtraBufferParams(EncodeSurfaceStateArgs &args) {
    auto surfaceState = reinterpret_cast<R_SURFACE_STATE *>(args.outMemory);
    const bool isL3Allowed = surfaceState->getMemoryObjectControlState() == args.gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER);
    if (isL3Allowed) {
        const bool isConstantSurface = args.allocation && args.allocation->getAllocationType() == AllocationType::constantSurface;
        bool useL1 = args.isReadOnly || isConstantSurface;

        if (debugManager.flags.ForceL1Caching.get() != 1) {
            useL1 = false;
        }

        if (useL1) {
            surfaceState->setMemoryObjectControlState(args.gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CONST));
        }
    }
}

template <>
void EncodeL3State<Family>::encode(CommandContainer &container, bool enableSLM) {
}

template <>
void EncodeStoreMMIO<Family>::appendFlags(MI_STORE_REGISTER_MEM *storeRegMem, bool workloadPartition) {
    storeRegMem->setMmioRemapEnable(true);
}

template <>
void EncodeSurfaceState<Family>::appendImageCompressionParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper, bool imageFromBuffer, GMM_YUV_PLANE_ENUM plane) {
}

template <>
inline void EncodeSurfaceState<Family>::encodeExtraCacheSettings(R_SURFACE_STATE *surfaceState, const EncodeSurfaceStateArgs &args) {}

template <>
inline void EncodeWA<Family>::setAdditionalPipeControlFlagsForNonPipelineStateCommand(PipeControlArgs &args) {}

template <>
bool EncodeEnableRayTracing<Family>::is48bResourceNeededForRayTracing() {
    return true;
}

} // namespace NEO

#include "shared/source/command_container/command_encoder_enablers.inl"

namespace NEO {
template struct EncodeL3State<Family>;
template void InOrderPatchCommandHelpers::PatchCmd<Family>::patchComputeWalker(uint64_t appendCounterValue);
} // namespace NEO

#include "shared/source/command_container/implicit_scaling_before_xe_hp.inl"