/*
 * Copyright (C) 2020-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/source/helpers/state_base_address.h"
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
#include "shared/source/kernel/implicit_args.h"

#include "pipe_control_args.h"

#include <algorithm>

namespace NEO {

template <typename Family>
void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
                                          const void *pThreadGroupDimensions,
                                          bool isIndirect,
                                          bool isPredicate,
                                          DispatchKernelEncoderI *dispatchInterface,
                                          uint64_t eventAddress,
                                          bool isTimestampEvent,
                                          bool L3FlushEnable,
                                          Device *device,
                                          PreemptionMode preemptionMode,
                                          bool &requiresUncachedMocs,
                                          bool useGlobalAtomics,
                                          uint32_t &partitionCount,
                                          bool isInternal,
                                          bool isCooperative) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
    using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;

    auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
    auto sizeCrossThreadData = dispatchInterface->getCrossThreadDataSize();
    auto sizePerThreadData = dispatchInterface->getPerThreadDataSize();
    auto sizePerThreadDataForWholeGroup = dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
    auto pImplicitArgs = dispatchInterface->getImplicitArgs();

    const HardwareInfo &hwInfo = device->getHardwareInfo();

    LinearStream *listCmdBufferStream = container.getCommandStream();
    size_t sshOffset = 0;

    auto threadDims = static_cast<const uint32_t *>(pThreadGroupDimensions);
    const Vec3<size_t> threadStartVec{0, 0, 0};
    Vec3<size_t> threadDimsVec{0, 0, 0};
    if (!isIndirect) {
        threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
    }

    size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec,
                                                                        isInternal, isCooperative, isIndirect, dispatchInterface);
    if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
        auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
        *bbEnd = Family::cmdInitBatchBufferEnd;

        container.allocateNextCommandBuffer();
    }

    WALKER_TYPE cmd = Family::cmdInitGpgpuWalker;
    auto idd = Family::cmdInitInterfaceDescriptorData;

    {
        auto alloc = dispatchInterface->getIsaAllocation();
        UNRECOVERABLE_IF(nullptr == alloc);
        auto offset = alloc->getGpuAddressToPatch();
        idd.setKernelStartPointer(offset);
        idd.setKernelStartPointerHigh(0u);
    }

    auto numThreadsPerThreadGroup = dispatchInterface->getNumThreadsPerThreadGroup();
    idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup);

    EncodeDispatchKernel<Family>::programBarrierEnable(idd,
                                                       kernelDescriptor.kernelAttributes.barrierCount,
                                                       hwInfo);

    auto slmSize = static_cast<typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE>(
        HwHelperHw<Family>::get().computeSlmValues(hwInfo, dispatchInterface->getSlmTotalSize()));
    idd.setSharedLocalMemorySize(slmSize);

    uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
    uint32_t bindingTablePointer = 0u;
    bool isBindlessKernel = kernelDescriptor.kernelAttributes.bufferAddressingMode == KernelDescriptor::BindlessAndStateless;
    if (!isBindlessKernel) {
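        // Bindful path: the kernel's surface states and binding table are copied
        // from its SSH template into the container's SURFACE_STATE heap; the
        // interface descriptor is then pointed at the returned table offset.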
        container.prepareBindfulSsh();
        if (bindingTableStateCount > 0u) {
            auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE,
                                                                     dispatchInterface->getSurfaceStateHeapDataSize(),
                                                                     BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
            sshOffset = ssh->getUsed();
            bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
                *ssh, bindingTableStateCount,
                dispatchInterface->getSurfaceStateHeapData(),
                dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
                kernelDescriptor.payloadMappings.bindingTable.tableOffset));
        }
    }
    idd.setBindingTablePointer(bindingTablePointer);

    PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, preemptionMode);

    auto heap = ApiSpecificConfig::getBindlessConfiguration() ? device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH)
                                                              : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
    UNRECOVERABLE_IF(!heap);

    uint32_t samplerStateOffset = 0;
    uint32_t samplerCount = 0;
    if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
        samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
        samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
                                                                    kernelDescriptor.payloadMappings.samplerTable.numSamplers,
                                                                    kernelDescriptor.payloadMappings.samplerTable.borderColor,
                                                                    dispatchInterface->getDynamicStateHeapData(),
                                                                    device->getBindlessHeapsHelper(), hwInfo);
    }
    idd.setSamplerStatePointer(samplerStateOffset);

    if (!isBindlessKernel) {
        EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount);
    }

    auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / sizeof(float[8]));
    idd.setCrossThreadConstantDataReadLength(numGrfCrossThreadData);

    auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(float[8]));
    DEBUG_BREAK_IF(numGrfPerThreadData <= 0u);
    idd.setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);

    uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
    uint32_t sizeForImplicitArgsPatching = dispatchInterface->getSizeForImplicitArgsPatching();
    uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
    uint64_t offsetThreadData = 0u;
    {
        auto heapIndirect = container.getIndirectHeap(HeapType::INDIRECT_OBJECT);
        UNRECOVERABLE_IF(!(heapIndirect));
        heapIndirect->align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);

        auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, iohRequiredSize);
        UNRECOVERABLE_IF(!(ptr));
        offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast<uint64_t>(heapIndirect->getUsed() - sizeThreadData);

        if (pImplicitArgs) {
            offsetThreadData -= sizeof(ImplicitArgs);
            pImplicitArgs->localIdTablePtr = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - iohRequiredSize;
            dispatchInterface->patchImplicitArgs(ptr);
        }

        memcpy_s(ptr, sizeCrossThreadData,
                 dispatchInterface->getCrossThreadData(), sizeCrossThreadData);

        if (isIndirect) {
            auto gpuPtr = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData;
            uint64_t implicitArgsGpuPtr = 0u;
            if (pImplicitArgs) {
                implicitArgsGpuPtr = gpuPtr - sizeof(ImplicitArgs);
            }
            EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface, implicitArgsGpuPtr);
        }

        ptr = ptrOffset(ptr, sizeCrossThreadData);
        memcpy_s(ptr, sizePerThreadDataForWholeGroup,
                 dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup);
    }

    auto slmSizeNew = dispatchInterface->getSlmTotalSize();
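    // A PIPE_CONTROL (plus STATE_BASE_ADDRESS / L3 reprogramming) is emitted only
    // when state actually changed: the SLM size differs, a heap was reallocated,
    // or the stateless MOCS must switch to the uncached index.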
    bool dirtyHeaps = container.isAnyHeapDirty();
    bool flush = container.slmSize != slmSizeNew || dirtyHeaps || requiresUncachedMocs;

    if (flush) {
        PipeControlArgs args;
        args.dcFlushEnable = MemorySynchronizationCommands<Family>::isDcFlushAllowed(true, hwInfo);
        if (dirtyHeaps) {
            args.hdcPipelineFlush = true;
        }
        MemorySynchronizationCommands<Family>::addPipeControl(*container.getCommandStream(), args);

        if (dirtyHeaps || requiresUncachedMocs) {
            STATE_BASE_ADDRESS sba;
            auto gmmHelper = container.getDevice()->getGmmHelper();
            uint32_t statelessMocsIndex = requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1)
                                                               : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
            EncodeStateBaseAddress<Family>::encode(container, sba, statelessMocsIndex, false);
            container.setDirtyStateForAllHeaps(false);
            requiresUncachedMocs = false;
        }

        if (container.slmSize != slmSizeNew) {
            EncodeL3State<Family>::encode(container, slmSizeNew != 0u);
            container.slmSize = slmSizeNew;

            if (container.nextIddInBlock != container.getNumIddPerBlock()) {
                EncodeMediaInterfaceDescriptorLoad<Family>::encode(container);
            }
        }
    }

    uint32_t numIDD = 0u;
    void *ptr = getInterfaceDescriptor(container, numIDD);
    memcpy_s(ptr, sizeof(idd), &idd, sizeof(idd));

    cmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
    cmd.setIndirectDataLength(sizeThreadData);
    cmd.setInterfaceDescriptorOffset(numIDD);

    EncodeDispatchKernel<Family>::encodeThreadData(cmd,
                                                   nullptr,
                                                   threadDims,
                                                   dispatchInterface->getGroupSize(),
                                                   kernelDescriptor.kernelAttributes.simdSize,
                                                   kernelDescriptor.kernelAttributes.numLocalIdChannels,
                                                   dispatchInterface->getNumThreadsPerThreadGroup(),
                                                   dispatchInterface->getThreadExecutionMask(),
                                                   true,
                                                   false,
                                                   isIndirect,
                                                   dispatchInterface->getRequiredWorkgroupOrder());

    cmd.setPredicateEnable(isPredicate);

    if (ApiSpecificConfig::getBindlessConfiguration()) {
        container.getResidencyContainer().push_back(device->getBindlessHeapsHelper()->getHeap(NEO::BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH)->getGraphicsAllocation());
    }

    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo);

    PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *device);

    auto buffer = listCmdBufferStream->getSpace(sizeof(cmd));
    *(decltype(cmd) *)buffer = cmd;

    PreemptionHelper::applyPreemptionWaCmdsEnd<Family>(listCmdBufferStream, *device);

    {
        auto mediaStateFlush = listCmdBufferStream->getSpace(sizeof(MEDIA_STATE_FLUSH));
        *reinterpret_cast<MEDIA_STATE_FLUSH *>(mediaStateFlush) = Family::cmdInitMediaStateFlush;
    }

    partitionCount = 1;
}

template <typename Family>
void EncodeMediaInterfaceDescriptorLoad<Family>::encode(CommandContainer &container) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    auto heapBase = ApiSpecificConfig::getBindlessConfiguration() ? container.getDevice()->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH)->getGraphicsAllocation()->getUnderlyingBuffer()
                                                                  : container.getIndirectHeap(HeapType::DYNAMIC_STATE)->getCpuBase();

    auto mediaStateFlush = container.getCommandStream()->getSpaceForCmd<MEDIA_STATE_FLUSH>();
    *mediaStateFlush = Family::cmdInitMediaStateFlush;

    auto iddOffset = static_cast<uint32_t>(ptrDiff(container.getIddBlock(), heapBase));
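    // With the bindless configuration the interface descriptors live in the
    // global DSH, whose GPU address need not match its CPU base, so the
    // CPU-relative offset is shifted by (gpuAddress - gpuBaseAddress) of that
    // heap's allocation.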
    iddOffset += ApiSpecificConfig::getBindlessConfiguration() ? static_cast<uint32_t>(container.getDevice()->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH)->getGraphicsAllocation()->getGpuAddress() -
                                                                                       container.getDevice()->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH)->getGraphicsAllocation()->getGpuBaseAddress())
                                                                : 0;

    MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = Family::cmdInitMediaInterfaceDescriptorLoad;
    cmd.setInterfaceDescriptorDataStartAddress(iddOffset);
    cmd.setInterfaceDescriptorTotalLength(sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock());

    auto buffer = container.getCommandStream()->getSpace(sizeof(cmd));
    *(decltype(cmd) *)buffer = cmd;
}

template <typename Family>
inline bool EncodeDispatchKernel<Family>::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
                                                                              size_t *lws,
                                                                              std::array<uint8_t, 3> walkOrder,
                                                                              bool requireInputWalkOrder,
                                                                              uint32_t &requiredWalkOrder,
                                                                              uint32_t simd) {
    requiredWalkOrder = 0u;
    return true;
}

template <typename Family>
void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
                                                    const uint32_t *startWorkGroup,
                                                    const uint32_t *numWorkGroups,
                                                    const uint32_t *workGroupSizes,
                                                    uint32_t simd,
                                                    uint32_t localIdDimensions,
                                                    uint32_t threadsPerThreadGroup,
                                                    uint32_t threadExecutionMask,
                                                    bool localIdsGenerationByRuntime,
                                                    bool inlineDataProgrammingRequired,
                                                    bool isIndirect,
                                                    uint32_t requiredWorkGroupOrder) {
    if (isIndirect) {
        walkerCmd.setIndirectParameterEnable(true);
    } else {
        walkerCmd.setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
        walkerCmd.setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
        walkerCmd.setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
    }

    if (startWorkGroup) {
        walkerCmd.setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroup[0]));
        walkerCmd.setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroup[1]));
        walkerCmd.setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroup[2]));
    }

    walkerCmd.setSimdSize(getSimdConfig<WALKER_TYPE>(simd));

    auto localWorkSize = workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2];
    if (threadsPerThreadGroup == 0) {
        threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkSize));
    }
    walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);

    uint64_t executionMask = threadExecutionMask;
    if (executionMask == 0) {
        auto remainderSimdLanes = localWorkSize & (simd - 1);
        executionMask = maxNBitValue(remainderSimdLanes);
        if (!executionMask)
            executionMask = ~executionMask;
    }

    constexpr uint32_t maxDword = std::numeric_limits<uint32_t>::max();
    walkerCmd.setRightExecutionMask(static_cast<uint32_t>(executionMask));
    walkerCmd.setBottomExecutionMask(maxDword);
}

template <typename Family>
void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor,
                                                        uint32_t value,
                                                        const HardwareInfo &hwInfo) {
    interfaceDescriptor.setBarrierEnable(value);
}

template <typename Family>
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const HardwareInfo &hwInfo, WALKER_TYPE &walkerCmd, KernelExecutionType kernelExecutionType) {}

template <typename Family>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}

template <typename Family>
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount, bool isInternal,
                                                                          bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
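    // Worst-case estimate: every optional command encode() may emit is counted,
    // including the MI_BATCH_BUFFER_END needed if the command buffer has to be
    // chained to a new allocation mid-dispatch.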
    size_t issueMediaInterfaceDescriptorLoad = sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD);
    size_t totalSize = sizeof(WALKER_TYPE);
    totalSize += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
    totalSize += sizeof(MEDIA_STATE_FLUSH);
    totalSize += issueMediaInterfaceDescriptorLoad;
    totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
    totalSize += EncodeWA<Family>::getAdditionalPipelineSelectSize(*device);
    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
    if (isIndirect) {
        UNRECOVERABLE_IF(dispatchInterface == nullptr);
        totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), false);
        if (dispatchInterface->getImplicitArgs()) {
            totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
            totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
            totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), true);
        }
    }
    totalSize += sizeof(MI_BATCH_BUFFER_END);

    return totalSize;
}

template <typename Family>
inline void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const HardwareInfo &hwInfo) {
}

template <typename Family>
inline void EncodeComputeMode<Family>::adjustPipelineSelect(CommandContainer &container, const NEO::KernelDescriptor &kernelDescriptor) {
}

template <typename Family>
void EncodeStateBaseAddress<Family>::setIohAddressForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) {
    sbaAddress.IndirectObjectBaseAddress = sbaCmd.getIndirectObjectBaseAddress();
}

template <typename Family>
void EncodeStateBaseAddress<Family>::encode(CommandContainer &container, STATE_BASE_ADDRESS &sbaCmd) {
    auto gmmHelper = container.getDevice()->getRootDeviceEnvironment().getGmmHelper();
    uint32_t statelessMocsIndex = (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
    EncodeStateBaseAddress<Family>::encode(container, sbaCmd, statelessMocsIndex, false);
}

template <typename Family>
void EncodeStateBaseAddress<Family>::encode(CommandContainer &container, STATE_BASE_ADDRESS &sbaCmd, uint32_t statelessMocsIndex, bool useGlobalAtomics) {
    if (container.isAnyHeapDirty()) {
        EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), true);
    }

    auto gmmHelper = container.getDevice()->getGmmHelper();
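    // Only heaps marked dirty are (re)programmed; passing nullptr skips
    // programming that heap's base address in the generated STATE_BASE_ADDRESS.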
    StateBaseAddressHelper<Family>::programStateBaseAddress(&sbaCmd,
                                                            container.isHeapDirty(HeapType::DYNAMIC_STATE) ? container.getIndirectHeap(HeapType::DYNAMIC_STATE) : nullptr,
                                                            container.isHeapDirty(HeapType::INDIRECT_OBJECT) ? container.getIndirectHeap(HeapType::INDIRECT_OBJECT) : nullptr,
                                                            container.isHeapDirty(HeapType::SURFACE_STATE) ? container.getIndirectHeap(HeapType::SURFACE_STATE) : nullptr,
                                                            0,
                                                            false,
                                                            statelessMocsIndex,
                                                            container.getIndirectObjectHeapBaseAddress(),
                                                            container.getInstructionHeapBaseAddress(),
                                                            0,
                                                            false,
                                                            false,
                                                            gmmHelper,
                                                            false,
                                                            MemoryCompressionState::NotApplicable,
                                                            useGlobalAtomics,
                                                            1u);

    auto pCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(container.getCommandStream()->getSpace(sizeof(STATE_BASE_ADDRESS)));
    *pCmd = sbaCmd;

    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false);
}

template <typename Family>
size_t EncodeStateBaseAddress<Family>::getRequiredSizeForStateBaseAddress(Device &device, CommandContainer &container) {
    return sizeof(typename Family::STATE_BASE_ADDRESS) + 2 * EncodeWA<Family>::getAdditionalPipelineSelectSize(device);
}

template <typename Family>
void EncodeL3State<Family>::encode(CommandContainer &container, bool enableSLM) {
    auto offset = L3CNTLRegisterOffset<Family>::registerOffset;
    auto data = PreambleHelper<Family>::getL3Config(container.getDevice()->getHardwareInfo(), enableSLM);
    EncodeSetMMIO<Family>::encodeIMM(container, offset, data, false);
}

template <typename Family>
void EncodeMiFlushDW<Family>::appendMiFlushDw(MI_FLUSH_DW *miFlushDwCmd, const HardwareInfo &hwInfo) {}

template <typename Family>
void EncodeMiFlushDW<Family>::programMiFlushDwWA(LinearStream &commandStream) {}

template <typename Family>
size_t EncodeMiFlushDW<Family>::getMiFlushDwWaSize() {
    return 0;
}

template <typename Family>
inline void EncodeWA<Family>::encodeAdditionalPipelineSelect(Device &device, LinearStream &stream, bool is3DPipeline) {}

template <typename Family>
inline size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device) {
    return 0;
}

template <typename Family>
inline void EncodeSurfaceState<Family>::encodeExtraBufferParams(EncodeSurfaceStateArgs &args) {
    auto surfaceState = reinterpret_cast<R_SURFACE_STATE *>(args.outMemory);
    encodeExtraCacheSettings(surfaceState, *args.gmmHelper->getHardwareInfo());
}

template <typename Family>
bool EncodeSurfaceState<Family>::doBindingTablePrefetch() {
    return true;
}

template <typename Family>
inline void EncodeSurfaceState<Family>::setCoherencyType(R_SURFACE_STATE *surfaceState, COHERENCY_TYPE coherencyType) {
    surfaceState->setCoherencyType(coherencyType);
}

template <typename Family>
void EncodeSempahore<Family>::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd,
                                                     uint64_t compareAddress,
                                                     uint32_t compareData,
                                                     COMPARE_OPERATION compareMode,
                                                     bool registerPollMode) {
    MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait;
    localCmd.setCompareOperation(compareMode);
    localCmd.setSemaphoreDataDword(compareData);
    localCmd.setSemaphoreGraphicsAddress(compareAddress);
    localCmd.setWaitMode(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE);

    *cmd = localCmd;
}

template <typename Family>
void EncodeEnableRayTracing<Family>::programEnableRayTracing(LinearStream &commandStream, GraphicsAllocation &backBuffer) {
}

template <typename Family>
inline void EncodeStoreMemory<Family>::programStoreDataImm(MI_STORE_DATA_IMM *cmdBuffer,
                                                           uint64_t gpuAddress,
                                                           uint32_t dataDword0,
                                                           uint32_t dataDword1,
                                                           bool storeQword,
                                                           bool workloadPartitionOffset) {
    MI_STORE_DATA_IMM storeDataImmediate = Family::cmdInitStoreDataImm;
    storeDataImmediate.setAddress(gpuAddress);
    storeDataImmediate.setStoreQword(storeQword);
    storeDataImmediate.setDataDword0(dataDword0);
    if (storeQword) {
        storeDataImmediate.setDataDword1(dataDword1);
        storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_QWORD);
    } else {
        storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD);
    }
    *cmdBuffer = storeDataImmediate;
}

} // namespace NEO