/*
 * Copyright (C) 2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/source/helpers/state_base_address.h"
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"

#include "pipe_control_args.h"

#include <algorithm>

namespace NEO {

template <typename Family>
void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
                                          const void *pThreadGroupDimensions, bool isIndirect, bool isPredicate, DispatchKernelEncoderI *dispatchInterface,
                                          uint64_t eventAddress, Device *device, PreemptionMode preemptionMode, bool &requiresUncachedMocs) {

    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
    using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;

    auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
    auto sizeCrossThreadData = dispatchInterface->getCrossThreadDataSize();
    auto sizePerThreadData = dispatchInterface->getPerThreadDataSize();
    auto sizePerThreadDataForWholeGroup = dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
    const HardwareInfo &hwInfo = device->getHardwareInfo();

    LinearStream *listCmdBufferStream = container.getCommandStream();
    size_t sshOffset = 0;

    size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device);
    if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
        // Not enough room left for this dispatch: terminate the current command buffer
        // with a BB_END and chain to a freshly allocated one.
        auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
        *bbEnd = Family::cmdInitBatchBufferEnd;

        container.allocateNextCommandBuffer();
    }

    WALKER_TYPE cmd = Family::cmdInitGpgpuWalker;
    auto idd = Family::cmdInitInterfaceDescriptorData;

    {
        auto alloc = dispatchInterface->getIsaAllocation();
        UNRECOVERABLE_IF(nullptr == alloc);
        auto offset = alloc->getGpuAddressToPatch();
        idd.setKernelStartPointer(offset);
        idd.setKernelStartPointerHigh(0u);
    }

    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), true);
    EncodeStates<Family>::adjustStateComputeMode(*container.getCommandStream(), container.lastSentNumGrfRequired, nullptr, false, false);
    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false);

    auto numThreadsPerThreadGroup = dispatchInterface->getNumThreadsPerThreadGroup();
    idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup);

    EncodeDispatchKernel<Family>::programBarrierEnable(idd, kernelDescriptor.kernelAttributes.barrierCount, hwInfo);

    auto slmSize = static_cast<typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE>(
        HwHelperHw<Family>::get().computeSlmValues(hwInfo, dispatchInterface->getSlmTotalSize()));
    idd.setSharedLocalMemorySize(slmSize);

    uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
    uint32_t bindingTablePointer = 0u;
    bool isBindlessKernel = kernelDescriptor.kernelAttributes.bufferAddressingMode == KernelDescriptor::BindlessAndStateless;
    if (!isBindlessKernel) {
        if (bindingTableStateCount > 0u) {
            auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
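            // pushBindingTableAndSurfaceStates() copies the kernel's surface states into the
            // container SSH and rewrites the binding table entries to point at their new
            // locations; the returned value is the binding table's offset within the heap,
            // which is what the interface descriptor expects below.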
            sshOffset = ssh->getUsed();
            bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
                *ssh, bindingTableStateCount,
                dispatchInterface->getSurfaceStateHeapData(),
                dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
                kernelDescriptor.payloadMappings.bindingTable.tableOffset));
        }
        idd.setBindingTablePointer(bindingTablePointer);
    }

    PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, preemptionMode);

    auto heap = ApiSpecificConfig::getBindlessConfiguration() ? device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
    UNRECOVERABLE_IF(!heap);

    uint32_t samplerStateOffset = 0;
    uint32_t samplerCount = 0;
    if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
        samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
        samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
                                                                    kernelDescriptor.payloadMappings.samplerTable.numSamplers,
                                                                    kernelDescriptor.payloadMappings.samplerTable.borderColor,
                                                                    dispatchInterface->getDynamicStateHeapData(),
                                                                    device->getBindlessHeapsHelper());
        if (ApiSpecificConfig::getBindlessConfiguration()) {
            container.getResidencyContainer().push_back(device->getBindlessHeapsHelper()->getHeap(NEO::BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH)->getGraphicsAllocation());
        }
    }

    idd.setSamplerStatePointer(samplerStateOffset);

    EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount);

    // Data read lengths are programmed in GRF units; one GRF register on these
    // generations is 32 bytes, i.e. sizeof(float[8]).
    auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / sizeof(float[8]));
    idd.setCrossThreadConstantDataReadLength(numGrfCrossThreadData);

    auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(float[8]));
    DEBUG_BREAK_IF(numGrfPerThreadData <= 0u);
    idd.setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);

    uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
    uint64_t offsetThreadData = 0u;
    {
        auto heapIndirect = container.getIndirectHeap(HeapType::INDIRECT_OBJECT);
        UNRECOVERABLE_IF(!(heapIndirect));
        heapIndirect->align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);

        auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, sizeThreadData);
        UNRECOVERABLE_IF(!(ptr));
        offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast<uint64_t>(heapIndirect->getUsed() - sizeThreadData);

        memcpy_s(ptr, sizeCrossThreadData,
                 dispatchInterface->getCrossThreadData(), sizeCrossThreadData);

        if (isIndirect) {
            void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getHeapGpuBase() + heapIndirect->getUsed() - sizeThreadData);
            EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
            EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
        }

        ptr = ptrOffset(ptr, sizeCrossThreadData);
        memcpy_s(ptr, sizePerThreadDataForWholeGroup,
                 dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup);
    }

    auto slmSizeNew = dispatchInterface->getSlmTotalSize();
    bool dirtyHeaps = container.isAnyHeapDirty();
    bool flush = container.slmSize != slmSizeNew || dirtyHeaps || requiresUncachedMocs;

    if (flush) {
        PipeControlArgs args(true);
        if (dirtyHeaps) {
            args.hdcPipelineFlush = true;
        }
        MemorySynchronizationCommands<Family>::addPipeControl(*container.getCommandStream(), args);
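        // Dirty heaps or a stateless-MOCS change mean STATE_BASE_ADDRESS must be
        // reprogrammed. GmmHelper::getMOCS() returns the MOCS index pre-shifted left by
        // one bit, so ">> 1" recovers the raw index that the SBA stateless MOCS field takes.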
        if (dirtyHeaps || requiresUncachedMocs) {
            STATE_BASE_ADDRESS sba;
            auto gmmHelper = container.getDevice()->getGmmHelper();
            uint32_t statelessMocsIndex = requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
            EncodeStateBaseAddress<Family>::encode(container, sba, statelessMocsIndex);
            container.setDirtyStateForAllHeaps(false);
            requiresUncachedMocs = false;
        }

        if (container.slmSize != slmSizeNew) {
            EncodeL3State<Family>::encode(container, slmSizeNew != 0u);
            container.slmSize = slmSizeNew;

            if (container.nextIddInBlock != container.getNumIddPerBlock()) {
                EncodeMediaInterfaceDescriptorLoad<Family>::encode(container);
            }
        }
    }

    uint32_t numIDD = 0u;
    void *ptr = getInterfaceDescriptor(container, numIDD);
    memcpy_s(ptr, sizeof(idd), &idd, sizeof(idd));

    cmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
    cmd.setIndirectDataLength(sizeThreadData);
    cmd.setInterfaceDescriptorOffset(numIDD);

    EncodeDispatchKernel<Family>::encodeThreadData(cmd,
                                                   nullptr,
                                                   static_cast<const uint32_t *>(pThreadGroupDimensions),
                                                   dispatchInterface->getGroupSize(),
                                                   kernelDescriptor.kernelAttributes.simdSize,
                                                   kernelDescriptor.kernelAttributes.numLocalIdChannels,
                                                   dispatchInterface->getNumThreadsPerThreadGroup(),
                                                   dispatchInterface->getThreadExecutionMask(),
                                                   true,
                                                   false,
                                                   isIndirect,
                                                   dispatchInterface->getRequiredWorkgroupOrder());

    cmd.setPredicateEnable(isPredicate);

    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo);

    PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *device);

    auto buffer = listCmdBufferStream->getSpace(sizeof(cmd));
    *(decltype(cmd) *)buffer = cmd;

    PreemptionHelper::applyPreemptionWaCmdsEnd<Family>(listCmdBufferStream, *device);

    {
        auto mediaStateFlush = listCmdBufferStream->getSpace(sizeof(MEDIA_STATE_FLUSH));
        *reinterpret_cast<MEDIA_STATE_FLUSH *>(mediaStateFlush) = Family::cmdInitMediaStateFlush;
    }
}

template <typename Family>
void EncodeMediaInterfaceDescriptorLoad<Family>::encode(CommandContainer &container) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
    auto heap = container.getIndirectHeap(HeapType::DYNAMIC_STATE);

    auto mediaStateFlush = container.getCommandStream()->getSpaceForCmd<MEDIA_STATE_FLUSH>();
    *mediaStateFlush = Family::cmdInitMediaStateFlush;

    MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = Family::cmdInitMediaInterfaceDescriptorLoad;
    cmd.setInterfaceDescriptorDataStartAddress(static_cast<uint32_t>(ptrDiff(container.getIddBlock(), heap->getCpuBase())));
    cmd.setInterfaceDescriptorTotalLength(sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock());

    auto buffer = container.getCommandStream()->getSpace(sizeof(cmd));
    *(decltype(cmd) *)buffer = cmd;
}

template <typename Family>
bool EncodeDispatchKernel<Family>::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
                                                                       size_t *lws,
                                                                       std::array<uint8_t, 3> walkOrder,
                                                                       bool requireInputWalkOrder,
                                                                       uint32_t &requiredWalkOrder,
                                                                       uint32_t simd) {
    requiredWalkOrder = 0u;
    return true;
}

template <typename Family>
void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
                                                    const uint32_t *startWorkGroup,
                                                    const uint32_t *numWorkGroups,
                                                    const uint32_t *workGroupSizes,
                                                    uint32_t simd,
                                                    uint32_t localIdDimensions,
                                                    uint32_t threadsPerThreadGroup,
                                                    uint32_t threadExecutionMask,
                                                    bool localIdsGenerationByRuntime,
                                                    bool inlineDataProgrammingRequired,
                                                    bool isIndirect,
                                                    uint32_t requiredWorkGroupOrder) {
    if (isIndirect) {
        walkerCmd.setIndirectParameterEnable(true);
    } else {
        walkerCmd.setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
        walkerCmd.setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
        walkerCmd.setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
    }

    if (startWorkGroup) {
        walkerCmd.setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroup[0]));
        walkerCmd.setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroup[1]));
        walkerCmd.setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroup[2]));
    }

    walkerCmd.setSimdSize(getSimdConfig<WALKER_TYPE>(simd));

    auto localWorkSize = workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2];
    if (threadsPerThreadGroup == 0) {
        threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkSize));
    }
    walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);
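    // With no mask supplied, derive one: localWorkSize & (simd - 1) is the number of live
    // lanes in the trailing, partially filled SIMD thread, and maxNBitValue() builds a mask
    // with that many bits set. Zero means every thread is fully populated, so all lanes are
    // enabled (~0).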
    uint64_t executionMask = threadExecutionMask;
    if (executionMask == 0) {
        auto remainderSimdLanes = localWorkSize & (simd - 1);
        executionMask = maxNBitValue(remainderSimdLanes);
        if (!executionMask)
            executionMask = ~executionMask;
    }

    constexpr uint32_t maxDword = std::numeric_limits<uint32_t>::max();
    walkerCmd.setRightExecutionMask(static_cast<uint32_t>(executionMask));
    walkerCmd.setBottomExecutionMask(maxDword);
}

template <typename Family>
void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo) {
    interfaceDescriptor.setBarrierEnable(value);
}

template <typename Family>
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const HardwareInfo &hwInfo, WALKER_TYPE &walkerCmd) {}

template <typename Family>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}

template <typename Family>
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;

    size_t issueMediaInterfaceDescriptorLoad = sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD);
    size_t totalSize = sizeof(WALKER_TYPE);
    totalSize += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
    totalSize += sizeof(MEDIA_STATE_FLUSH);
    totalSize += issueMediaInterfaceDescriptorLoad;
    totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
    totalSize += EncodeWA<Family>::getAdditionalPipelineSelectSize(*device);
    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
    totalSize += sizeof(MI_BATCH_BUFFER_END);

    return totalSize;
}

template <typename Family>
void EncodeStateBaseAddress<Family>::encode(CommandContainer &container, STATE_BASE_ADDRESS &sbaCmd) {
    auto gmmHelper = container.getDevice()->getRootDeviceEnvironment().getGmmHelper();
    uint32_t statelessMocsIndex = (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
    EncodeStateBaseAddress<Family>::encode(container, sbaCmd, statelessMocsIndex);
}
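// Programs STATE_BASE_ADDRESS for whichever heaps the container has marked dirty (clean
// heaps are passed as nullptr and their base addresses left untouched), bracketed by the
// additional pipeline-select workaround on platforms that need it.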
template <typename Family>
void EncodeStateBaseAddress<Family>::encode(CommandContainer &container, STATE_BASE_ADDRESS &sbaCmd, uint32_t statelessMocsIndex) {
    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), true);

    auto gmmHelper = container.getDevice()->getGmmHelper();

    StateBaseAddressHelper<Family>::programStateBaseAddress(
        &sbaCmd,
        container.isHeapDirty(HeapType::DYNAMIC_STATE) ? container.getIndirectHeap(HeapType::DYNAMIC_STATE) : nullptr,
        container.isHeapDirty(HeapType::INDIRECT_OBJECT) ? container.getIndirectHeap(HeapType::INDIRECT_OBJECT) : nullptr,
        container.isHeapDirty(HeapType::SURFACE_STATE) ? container.getIndirectHeap(HeapType::SURFACE_STATE) : nullptr,
        0,
        false,
        statelessMocsIndex,
        container.getIndirectObjectHeapBaseAddress(),
        container.getInstructionHeapBaseAddress(),
        false,
        gmmHelper,
        false,
        MemoryCompressionState::NotApplicable);

    auto pCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(container.getCommandStream()->getSpace(sizeof(STATE_BASE_ADDRESS)));
    *pCmd = sbaCmd;

    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false);
}

template <typename Family>
void EncodeL3State<Family>::encode(CommandContainer &container, bool enableSLM) {
    auto offset = L3CNTLRegisterOffset<Family>::registerOffset;
    auto data = PreambleHelper<Family>::getL3Config(container.getDevice()->getHardwareInfo(), enableSLM);
    EncodeSetMMIO<Family>::encodeIMM(container, offset, data, false);
}

template <typename Family>
void EncodeMiFlushDW<Family>::appendMiFlushDw(MI_FLUSH_DW *miFlushDwCmd) {}

template <typename Family>
void EncodeMiFlushDW<Family>::programMiFlushDwWA(LinearStream &commandStream) {}

template <typename Family>
size_t EncodeMiFlushDW<Family>::getMiFlushDwWaSize() {
    return 0;
}

template <typename Family>
inline void EncodeWA<Family>::encodeAdditionalPipelineSelect(Device &device, LinearStream &stream, bool is3DPipeline) {}

template <typename Family>
inline size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device) {
    return 0;
}

template <typename Family>
void EncodeSurfaceState<Family>::encodeExtraBufferParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper, bool isReadOnly, uint32_t numAvailableDevices) {
    encodeExtraCacheSettings(surfaceState, *gmmHelper->getHardwareInfo());
}

template <typename Family>
bool EncodeSurfaceState<Family>::doBindingTablePrefetch() {
    return true;
}

template <typename Family>
void EncodeSempahore<Family>::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd,
                                                     uint64_t compareAddress,
                                                     uint32_t compareData,
                                                     COMPARE_OPERATION compareMode,
                                                     bool registerPollMode) {
    MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait;
    localCmd.setCompareOperation(compareMode);
    localCmd.setSemaphoreDataDword(compareData);
    localCmd.setSemaphoreGraphicsAddress(compareAddress);
    localCmd.setWaitMode(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE);

    *cmd = localCmd;
}

} // namespace NEO