/*
 * Copyright (C) 2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/source/helpers/state_base_address.h"
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"

#include "opencl/source/helpers/hardware_commands_helper.h"

#include "pipe_control_args.h"

#include <algorithm>

namespace NEO {

template <typename Family>
void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
                                          const void *pThreadGroupDimensions, bool isIndirect, bool isPredicate, DispatchKernelEncoderI *dispatchInterface,
                                          uint64_t eventAddress, Device *device, PreemptionMode preemptionMode) {

    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;

    auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
    auto sizeCrossThreadData = dispatchInterface->getCrossThreadDataSize();
    auto sizePerThreadData = dispatchInterface->getPerThreadDataSize();
    auto sizePerThreadDataForWholeGroup = dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();

    LinearStream *listCmdBufferStream = container.getCommandStream();

    // If the current command buffer cannot hold this dispatch, terminate it with a
    // batch buffer end and chain to a freshly allocated one.
    size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device);
    if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
        auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
        *bbEnd = Family::cmdInitBatchBufferEnd;
        container.allocateNextCommandBuffer();
    }

    WALKER_TYPE cmd = Family::cmdInitGpgpuWalker;
    auto idd = Family::cmdInitInterfaceDescriptorData;

    {
        auto alloc = dispatchInterface->getIsaAllocation();
        UNRECOVERABLE_IF(nullptr == alloc);
        auto offset = alloc->getGpuAddressToPatch();
        idd.setKernelStartPointer(offset);
        idd.setKernelStartPointerHigh(0u);
    }

    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), true);
    EncodeStates<Family>::adjustStateComputeMode(*container.getCommandStream(), container.lastSentNumGrfRequired, nullptr, false, false);
    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false);

    auto numThreadsPerThreadGroup = dispatchInterface->getNumThreadsPerThreadGroup();
    idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup);
    idd.setBarrierEnable(kernelDescriptor.kernelAttributes.flags.usesBarriers);
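
    // Shared local memory and binding table setup: the SLM requirement is encoded into
    // the interface descriptor, and the kernel's surface states (if any) are copied onto
    // the surface state heap so the binding table pointer can be programmed.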
    idd.setSharedLocalMemorySize(
        dispatchInterface->getSlmTotalSize() > 0
            ? static_cast<typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE>(HardwareCommandsHelper<Family>::computeSlmValues(dispatchInterface->getSlmTotalSize()))
            : INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K);

    {
        uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
        uint32_t bindingTablePointer = 0u;
        if (bindingTableStateCount > 0u) {
            auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
            bindingTablePointer = static_cast<uint32_t>(HardwareCommandsHelper<Family>::pushBindingTableAndSurfaceStates(
                *ssh, bindingTableStateCount,
                dispatchInterface->getSurfaceStateHeapData(),
                dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
                kernelDescriptor.payloadMappings.bindingTable.tableOffset));
        }
        idd.setBindingTablePointer(bindingTablePointer);

        // The interface descriptor can prefetch at most 31 binding table entries.
        uint32_t bindingTableStatePrefetchCount = 0;
        if (HardwareCommandsHelper<Family>::doBindingTablePrefetch()) {
            bindingTableStatePrefetchCount = std::min(31u, bindingTableStateCount);
        }
        idd.setBindingTableEntryCount(bindingTableStatePrefetchCount);
    }

    PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, preemptionMode);

    auto heap = container.getIndirectHeap(HeapType::DYNAMIC_STATE);
    UNRECOVERABLE_IF(!heap);

    uint32_t samplerStateOffset = 0;
    uint32_t samplerCount = 0;
    if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
        samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
        samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
                                                                    kernelDescriptor.payloadMappings.samplerTable.numSamplers,
                                                                    kernelDescriptor.payloadMappings.samplerTable.borderColor,
                                                                    dispatchInterface->getDynamicStateHeapData());
    }
    idd.setSamplerStatePointer(samplerStateOffset);

    auto samplerCountState = static_cast<typename INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT>((samplerCount + 3) / 4);
    idd.setSamplerCount(samplerCountState);

    // Cross-thread and per-thread payload lengths are expressed in GRFs (32 bytes each).
    auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / sizeof(float[8]));
    DEBUG_BREAK_IF(numGrfCrossThreadData <= 0u);
    idd.setCrossThreadConstantDataReadLength(numGrfCrossThreadData);

    auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(float[8]));
    DEBUG_BREAK_IF(numGrfPerThreadData <= 0u);
    idd.setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);

    uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
    uint64_t offsetThreadData = 0u;
    {
        auto heapIndirect = container.getIndirectHeap(HeapType::INDIRECT_OBJECT);
        UNRECOVERABLE_IF(!(heapIndirect));
        heapIndirect->align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);

        auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, sizeThreadData);
        UNRECOVERABLE_IF(!(ptr));
        offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast<uint64_t>(heapIndirect->getUsed() - sizeThreadData);

        memcpy_s(ptr, sizeCrossThreadData,
                 dispatchInterface->getCrossThreadData(), sizeCrossThreadData);

        if (isIndirect) {
            void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getHeapGpuBase() + heapIndirect->getUsed() - sizeThreadData);
            EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
            EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
        }

        ptr = ptrOffset(ptr, sizeCrossThreadData);
        memcpy_s(ptr, sizePerThreadDataForWholeGroup,
                 dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup);
    }
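
    // SLM / heap reconfiguration: when the SLM size changes or any heap was reallocated,
    // a PIPE_CONTROL is emitted before the L3 configuration, interface descriptor load
    // and STATE_BASE_ADDRESS are re-programmed.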
    auto slmSizeNew = dispatchInterface->getSlmTotalSize();
    bool flush = container.slmSize != slmSizeNew || container.isAnyHeapDirty();

    if (flush) {
        PipeControlArgs args(true);
        MemorySynchronizationCommands<Family>::addPipeControl(*container.getCommandStream(), args);

        if (container.slmSize != slmSizeNew) {
            EncodeL3State<Family>::encode(container, slmSizeNew != 0u);
            container.slmSize = slmSizeNew;

            if (container.nextIddInBlock != container.getNumIddPerBlock()) {
                EncodeMediaInterfaceDescriptorLoad<Family>::encode(container);
            }
        }

        if (container.isAnyHeapDirty()) {
            EncodeStateBaseAddress<Family>::encode(container);
            container.setDirtyStateForAllHeaps(false);
        }
    }

    uint32_t numIDD = 0u;
    void *ptr = getInterfaceDescriptor(container, numIDD);
    memcpy_s(ptr, sizeof(idd), &idd, sizeof(idd));

    cmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
    cmd.setIndirectDataLength(sizeThreadData);
    cmd.setInterfaceDescriptorOffset(numIDD);

    if (isIndirect) {
        cmd.setIndirectParameterEnable(true);
    } else {
        UNRECOVERABLE_IF(!pThreadGroupDimensions);
        auto threadDims = static_cast<const uint32_t *>(pThreadGroupDimensions);
        cmd.setThreadGroupIdXDimension(threadDims[0]);
        cmd.setThreadGroupIdYDimension(threadDims[1]);
        cmd.setThreadGroupIdZDimension(threadDims[2]);
    }

    auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
    auto simdSizeOp = getSimdConfig<WALKER_TYPE>(simdSize);

    cmd.setSimdSize(simdSizeOp);
    cmd.setRightExecutionMask(dispatchInterface->getThreadExecutionMask());
    cmd.setBottomExecutionMask(0xffffffff);
    cmd.setThreadWidthCounterMaximum(numThreadsPerThreadGroup);
    cmd.setPredicateEnable(isPredicate);

    PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *device);
    auto buffer = listCmdBufferStream->getSpace(sizeof(cmd));
    *(decltype(cmd) *)buffer = cmd;
    PreemptionHelper::applyPreemptionWaCmdsEnd<Family>(listCmdBufferStream, *device);

    {
        auto mediaStateFlush = listCmdBufferStream->getSpace(sizeof(MEDIA_STATE_FLUSH));
        *reinterpret_cast<MEDIA_STATE_FLUSH *>(mediaStateFlush) = Family::cmdInitMediaStateFlush;
    }
}

template <typename Family>
void EncodeMediaInterfaceDescriptorLoad<Family>::encode(CommandContainer &container) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    auto heap = container.getIndirectHeap(HeapType::DYNAMIC_STATE);

    auto mediaStateFlush = container.getCommandStream()->getSpaceForCmd<MEDIA_STATE_FLUSH>();
    *mediaStateFlush = Family::cmdInitMediaStateFlush;

    MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = Family::cmdInitMediaInterfaceDescriptorLoad;
    cmd.setInterfaceDescriptorDataStartAddress(static_cast<uint32_t>(ptrDiff(container.getIddBlock(), heap->getCpuBase())));
    cmd.setInterfaceDescriptorTotalLength(sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock());

    auto buffer = container.getCommandStream()->getSpace(sizeof(cmd));
    *(decltype(cmd) *)buffer = cmd;
}
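
// STATE_BASE_ADDRESS re-programming below only passes the heaps that are marked dirty;
// clean heaps keep their previously programmed base addresses.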
template <typename Family>
void EncodeStateBaseAddress<Family>::encode(CommandContainer &container) {
    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), true);

    auto gmmHelper = container.getDevice()->getGmmHelper();

    StateBaseAddressHelper<Family>::programStateBaseAddress(
        *container.getCommandStream(),
        container.isHeapDirty(HeapType::DYNAMIC_STATE) ? container.getIndirectHeap(HeapType::DYNAMIC_STATE) : nullptr,
        container.isHeapDirty(HeapType::INDIRECT_OBJECT) ? container.getIndirectHeap(HeapType::INDIRECT_OBJECT) : nullptr,
        container.isHeapDirty(HeapType::SURFACE_STATE) ? container.getIndirectHeap(HeapType::SURFACE_STATE) : nullptr,
        0,
        false,
        (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1),
        container.getInstructionHeapBaseAddress(),
        false,
        gmmHelper,
        false);

    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false);
}

template <typename Family>
void EncodeL3State<Family>::encode(CommandContainer &container, bool enableSLM) {
    auto offset = L3CNTLRegisterOffset<Family>::registerOffset;
    auto data = PreambleHelper<Family>::getL3Config(container.getDevice()->getHardwareInfo(), enableSLM);
    EncodeSetMMIO<Family>::encodeIMM(container, offset, data);
}

template <typename Family>
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;

    size_t issueMediaInterfaceDescriptorLoad = sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD);
    size_t totalSize = sizeof(WALKER_TYPE);
    totalSize += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
    totalSize += sizeof(MEDIA_STATE_FLUSH);
    totalSize += issueMediaInterfaceDescriptorLoad;
    totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
    totalSize += EncodeWA<Family>::getAdditionalPipelineSelectSize(*device);
    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
    totalSize += sizeof(MI_BATCH_BUFFER_END);

    return totalSize;
}

template <typename Family>
void EncodeMiFlushDW<Family>::appendMiFlushDw(MI_FLUSH_DW *miFlushDwCmd) {}

template <typename Family>
void EncodeMiFlushDW<Family>::programMiFlushDwWA(LinearStream &commandStream) {}

template <typename Family>
size_t EncodeMiFlushDW<Family>::getMiFlushDwWaSize() {
    return 0;
}

template <typename Family>
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const HardwareInfo &hwInfo, WALKER_TYPE &walkerCmd) {}

template <typename Family>
inline void EncodeWA<Family>::encodeAdditionalPipelineSelect(Device &device, LinearStream &stream, bool is3DPipeline) {}

template <typename Family>
inline size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device) {
    return 0;
}

} // namespace NEO