/*
 * Copyright (C) 2020-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/debugger/debugger_l0.h"
#include "shared/source/device/device.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/bindless_heaps_helper.h"
#include "shared/source/helpers/blit_commands_helper.h"
#include "shared/source/helpers/definitions/command_encoder_args.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/preamble.h"
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/source/helpers/string.h"
#include "shared/source/image/image_surface_state.h"
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
#include "shared/source/kernel/implicit_args_helper.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/os_interface/product_helper.h"
#include "shared/source/program/kernel_info.h"

#include "encode_surface_state.inl"
#include "encode_surface_state_args.h"

#include <algorithm>

namespace NEO {

template <typename Family>
uint32_t EncodeStates<Family>::copySamplerState(IndirectHeap *dsh,
                                                uint32_t samplerStateOffset,
                                                uint32_t samplerCount,
                                                uint32_t borderColorOffset,
                                                const void *fnDynamicStateHeap,
                                                BindlessHeapsHelper *bindlessHeapHelper,
                                                const RootDeviceEnvironment &rootDeviceEnvironment) {
    auto sizeSamplerState = sizeof(SAMPLER_STATE) * samplerCount;
    auto borderColorSize = samplerStateOffset - borderColorOffset;

    SAMPLER_STATE *dstSamplerState = nullptr;
    uint32_t samplerStateOffsetInDsh = 0;

    dsh->align(NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
    uint32_t borderColorOffsetInDsh = 0;
    if (!bindlessHeapHelper || (!bindlessHeapHelper->isGlobalDshSupported())) {
        borderColorOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
        // add offset of graphics allocation base address relative to heap base address
        if (bindlessHeapHelper) {
            borderColorOffsetInDsh += static_cast<uint32_t>(ptrDiff(dsh->getGpuBase(), bindlessHeapHelper->getGlobalHeapsBase()));
        }
        auto borderColor = dsh->getSpace(borderColorSize);

        memcpy_s(borderColor, borderColorSize, ptrOffset(fnDynamicStateHeap, borderColorOffset), borderColorSize);

        dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
        samplerStateOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());

        dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(dsh->getSpace(sizeSamplerState));
    } else {
        auto borderColor = reinterpret_cast<const SAMPLER_BORDER_COLOR_STATE *>(ptrOffset(fnDynamicStateHeap, borderColorOffset));
        if (borderColor->getBorderColorRed() != 0.0f ||
            borderColor->getBorderColorGreen() != 0.0f ||
            borderColor->getBorderColorBlue() != 0.0f ||
            (borderColor->getBorderColorAlpha() != 0.0f && borderColor->getBorderColorAlpha() != 1.0f)) {
            UNRECOVERABLE_IF(true);
        } else if (borderColor->getBorderColorAlpha() == 0.0f) {
            borderColorOffsetInDsh = bindlessHeapHelper->getDefaultBorderColorOffset();
        } else {
            borderColorOffsetInDsh = bindlessHeapHelper->getAlphaBorderColorOffset();
        }
        dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
        auto samplerStateInDsh = bindlessHeapHelper->allocateSSInHeap(sizeSamplerState, nullptr, BindlessHeapsHelper::BindlesHeapType::globalDsh);
        dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(samplerStateInDsh.ssPtr);
        samplerStateOffsetInDsh = static_cast<uint32_t>(samplerStateInDsh.surfaceStateOffset);
    }

    auto &helper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
    auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();

    auto srcSamplerState = reinterpret_cast<const SAMPLER_STATE *>(ptrOffset(fnDynamicStateHeap, samplerStateOffset));
    SAMPLER_STATE state = {};
    for (uint32_t i = 0; i < samplerCount; i++) {
        state = srcSamplerState[i];
        state.setIndirectStatePointer(static_cast<uint32_t>(borderColorOffsetInDsh));
        helper.adjustSamplerState(&state, hwInfo);
        dstSamplerState[i] = state;
    }

    return samplerStateOffsetInDsh;
}
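
/*
 * encodeMulRegVal() multiplies the value in register "offset" by the
 * immediate "val" using only ADD operations (shift-and-add): gpr0 is
 * doubled on every iteration, and whenever bit i of "val" is set the
 * current gpr0 (== operand << i) is accumulated into gpr1.
 * Example: for val = 6 (0b110) the iterations add (operand << 1) and
 * (operand << 2) into gpr1, i.e. gpr1 = 2*operand + 4*operand = 6*operand,
 * which is then stored to dstAddress.
 */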
template <typename Family>
void EncodeMathMMIO<Family>::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs) {
    int logLws = 0;
    int i = val;
    while (val >> logLws) {
        logLws++;
    }

    EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csGprR0, offset, isBcs);
    EncodeSetMMIO<Family>::encodeIMM(container, RegisterOffsets::csGprR1, 0, true, isBcs);

    i = 0;
    while (i < logLws) {
        if (val & (1 << i)) {
            EncodeMath<Family>::addition(container, AluRegisters::gpr1, AluRegisters::gpr0, AluRegisters::gpr2);
            EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csGprR1, RegisterOffsets::csGprR2, isBcs);
        }
        EncodeMath<Family>::addition(container, AluRegisters::gpr0, AluRegisters::gpr0, AluRegisters::gpr2);
        EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csGprR0, RegisterOffsets::csGprR2, isBcs);
        i++;
    }
    EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RegisterOffsets::csGprR1, dstAddress, false, nullptr, isBcs);
}

/*
 * Compute *firstOperand > secondOperand and store the result in
 * MI_PREDICATE_RESULT, where firstOperand is a device memory address.
 *
 * To calculate the "greater than" operation on the device,
 * (secondOperand - *firstOperand) is used, and if the carry flag register is
 * set, then (*firstOperand) is greater than secondOperand.
 */
template <typename Family>
void EncodeMathMMIO<Family>::encodeGreaterThanPredicate(CommandContainer &container, uint64_t firstOperand, uint32_t secondOperand, bool isBcs) {
    EncodeSetMMIO<Family>::encodeMEM(container, RegisterOffsets::csGprR0, firstOperand, isBcs);
    EncodeSetMMIO<Family>::encodeIMM(container, RegisterOffsets::csGprR1, secondOperand, true, isBcs);

    /* RegisterOffsets::csGprR* registers map to AluRegisters::gpr* registers */
    EncodeMath<Family>::greaterThan(container, AluRegisters::gpr0, AluRegisters::gpr1, AluRegisters::gpr2);
    EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csPredicateResult, RegisterOffsets::csGprR2, isBcs);
}

/*
 * Compute bitwise AND between a register value from regOffset and immVal
 * and store it into dstAddress.
 */
template <typename Family>
void EncodeMathMMIO<Family>::encodeBitwiseAndVal(CommandContainer &container, uint32_t regOffset, uint32_t immVal, uint64_t dstAddress, bool workloadPartition, void **outCmdBuffer, bool isBcs) {
    EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csGprR13, regOffset, isBcs);
    EncodeSetMMIO<Family>::encodeIMM(container, RegisterOffsets::csGprR14, immVal, true, isBcs);
    EncodeMath<Family>::bitwiseAnd(container, AluRegisters::gpr13, AluRegisters::gpr14, AluRegisters::gpr12);
    EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RegisterOffsets::csGprR12, dstAddress, workloadPartition, outCmdBuffer, isBcs);
}
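
/*
 * Every read-modify-write math sequence below is emitted as a single MI_MATH
 * command followed by RegisterConstants::numAluInstForReadModifyWrite ALU
 * dwords: LOAD srca, LOAD srcb, <operation>, STORE. commandReserve() writes
 * the MI_MATH header and returns a pointer to the ALU payload, which
 * encodeAlu() then fills in.
 */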
* * Parameter "postOperationStateRegister" is the ALU register with the result * from the operation that the function caller is interested in obtaining. * * Parameter "finalResultRegister" is the final destination register where * data from "postOperationStateRegister" will be copied. */ template void EncodeMathMMIO::encodeAlu(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters srcA, AluRegisters srcB, AluRegisters op, AluRegisters finalResultRegister, AluRegisters postOperationStateRegister) { MI_MATH_ALU_INST_INLINE aluParam; aluParam.DW0.Value = 0x0; aluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::opcodeLoad); aluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::srca); aluParam.DW0.BitField.Operand2 = static_cast(srcA); *pAluParam = aluParam; pAluParam++; aluParam.DW0.Value = 0x0; aluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::opcodeLoad); aluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::srcb); aluParam.DW0.BitField.Operand2 = static_cast(srcB); *pAluParam = aluParam; pAluParam++; /* Order of operation: Operand1 Operand2 */ aluParam.DW0.Value = 0x0; aluParam.DW0.BitField.ALUOpcode = static_cast(op); aluParam.DW0.BitField.Operand1 = 0; aluParam.DW0.BitField.Operand2 = 0; *pAluParam = aluParam; pAluParam++; aluParam.DW0.Value = 0x0; aluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::opcodeStore); aluParam.DW0.BitField.Operand1 = static_cast(finalResultRegister); aluParam.DW0.BitField.Operand2 = static_cast(postOperationStateRegister); *pAluParam = aluParam; pAluParam++; } template uint32_t *EncodeMath::commandReserve(CommandContainer &container) { return commandReserve(*container.getCommandStream()); } template uint32_t *EncodeMath::commandReserve(LinearStream &cmdStream) { size_t size = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * RegisterConstants::numAluInstForReadModifyWrite; auto cmd = reinterpret_cast(cmdStream.getSpace(size)); MI_MATH mathBuffer; mathBuffer.DW0.Value = 0x0; mathBuffer.DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; mathBuffer.DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; mathBuffer.DW0.BitField.DwordLength = RegisterConstants::numAluInstForReadModifyWrite - 1; *reinterpret_cast(cmd) = mathBuffer; cmd++; return cmd; } template void EncodeMathMMIO::encodeAluAdd(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) { encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::opcodeAdd, finalResultRegister, AluRegisters::accu); } template void EncodeMathMMIO::encodeAluSubStoreCarry(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters regA, AluRegisters regB, AluRegisters finalResultRegister) { /* regB is subtracted from regA */ encodeAlu(pAluParam, regA, regB, AluRegisters::opcodeSub, finalResultRegister, AluRegisters::cf); } template void EncodeMathMMIO::encodeAluAnd(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) { encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::opcodeAnd, finalResultRegister, AluRegisters::accu); } template void EncodeMathMMIO::encodeIncrementOrDecrement(LinearStream &cmdStream, AluRegisters operandRegister, IncrementOrDecrementOperation operationType, bool isBcs) { LriHelper::program(&cmdStream, RegisterOffsets::csGprR7, 1, true, isBcs); LriHelper::program(&cmdStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); EncodeAluHelper 
template <typename Family>
void EncodeMathMMIO<Family>::encodeAluAdd(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) {
    encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::opcodeAdd, finalResultRegister, AluRegisters::accu);
}

template <typename Family>
void EncodeMathMMIO<Family>::encodeAluSubStoreCarry(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters regA, AluRegisters regB, AluRegisters finalResultRegister) {
    /* regB is subtracted from regA */
    encodeAlu(pAluParam, regA, regB, AluRegisters::opcodeSub, finalResultRegister, AluRegisters::cf);
}

template <typename Family>
void EncodeMathMMIO<Family>::encodeAluAnd(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) {
    encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::opcodeAnd, finalResultRegister, AluRegisters::accu);
}

template <typename Family>
void EncodeMathMMIO<Family>::encodeIncrementOrDecrement(LinearStream &cmdStream, AluRegisters operandRegister, IncrementOrDecrementOperation operationType, bool isBcs) {
    LriHelper<Family>::program(&cmdStream, RegisterOffsets::csGprR7, 1, true, isBcs);
    LriHelper<Family>::program(&cmdStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);

    EncodeAluHelper<Family, 4> aluHelper;
    aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srca, operandRegister);
    aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr7);
    aluHelper.setNextAlu((operationType == IncrementOrDecrementOperation::increment) ? AluRegisters::opcodeAdd
                                                                                     : AluRegisters::opcodeSub);
    aluHelper.setNextAlu(AluRegisters::opcodeStore, operandRegister, AluRegisters::accu);

    aluHelper.copyToCmdStream(cmdStream);
}

template <typename Family>
void EncodeMathMMIO<Family>::encodeIncrement(LinearStream &cmdStream, AluRegisters operandRegister, bool isBcs) {
    encodeIncrementOrDecrement(cmdStream, operandRegister, IncrementOrDecrementOperation::increment, isBcs);
}

template <typename Family>
void EncodeMathMMIO<Family>::encodeDecrement(LinearStream &cmdStream, AluRegisters operandRegister, bool isBcs) {
    encodeIncrementOrDecrement(cmdStream, operandRegister, IncrementOrDecrementOperation::decrement, isBcs);
}

/*
 * greaterThan() tests if firstOperandRegister is greater than
 * secondOperandRegister.
 */
template <typename Family>
void EncodeMath<Family>::greaterThan(CommandContainer &container, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) {
    uint32_t *cmd = EncodeMath<Family>::commandReserve(container);

    /* firstOperandRegister will be subtracted from secondOperandRegister */
    EncodeMathMMIO<Family>::encodeAluSubStoreCarry(reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(cmd), secondOperandRegister, firstOperandRegister, finalResultRegister);
}

template <typename Family>
void EncodeMath<Family>::addition(CommandContainer &container, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) {
    uint32_t *cmd = EncodeMath<Family>::commandReserve(container);

    EncodeMathMMIO<Family>::encodeAluAdd(reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(cmd), firstOperandRegister, secondOperandRegister, finalResultRegister);
}

template <typename Family>
void EncodeMath<Family>::addition(LinearStream &cmdStream, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) {
    uint32_t *cmd = EncodeMath<Family>::commandReserve(cmdStream);

    EncodeMathMMIO<Family>::encodeAluAdd(reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(cmd), firstOperandRegister, secondOperandRegister, finalResultRegister);
}

template <typename Family>
void EncodeMath<Family>::bitwiseAnd(CommandContainer &container, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) {
    uint32_t *cmd = EncodeMath<Family>::commandReserve(container);

    EncodeMathMMIO<Family>::encodeAluAnd(reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(cmd), firstOperandRegister, secondOperandRegister, finalResultRegister);
}
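
/*
 * The EncodeSetMMIO helpers below emit MI_LOAD_REGISTER_IMM / _MEM / _REG.
 * When isBcs is true, the MEM/REG variants rebase the register offsets by
 * RegisterOffsets::bcs0Base so the same logical GPRs are addressed on the
 * blitter (BCS) engine; the IMM path passes isBcs down to LriHelper.
 */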
template <typename Family>
inline void EncodeSetMMIO<Family>::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap, bool isBcs) {
    EncodeSetMMIO<Family>::encodeIMM(*container.getCommandStream(), offset, data, remap, isBcs);
}

template <typename Family>
inline void EncodeSetMMIO<Family>::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address, bool isBcs) {
    EncodeSetMMIO<Family>::encodeMEM(*container.getCommandStream(), offset, address, isBcs);
}

template <typename Family>
inline void EncodeSetMMIO<Family>::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset, bool isBcs) {
    EncodeSetMMIO<Family>::encodeREG(*container.getCommandStream(), dstOffset, srcOffset, isBcs);
}

template <typename Family>
inline void EncodeSetMMIO<Family>::encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap, bool isBcs) {
    LriHelper<Family>::program(&cmdStream, offset, data, remap, isBcs);
}

template <typename Family>
inline void EncodeStateBaseAddress<Family>::setSbaTrackingForL0DebuggerIfEnabled(bool trackingEnabled, Device &device, LinearStream &commandStream, STATE_BASE_ADDRESS &sbaCmd, bool useFirstLevelBB) {
    if (!trackingEnabled) {
        return;
    }
    NEO::Debugger::SbaAddresses sbaAddresses = {};
    NEO::EncodeStateBaseAddress<Family>::setSbaAddressesForDebugger(sbaAddresses, sbaCmd);
    device.getL0Debugger()->captureStateBaseAddress(commandStream, sbaAddresses, useFirstLevelBB);
}

template <typename Family>
void EncodeSetMMIO<Family>::encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address, bool isBcs) {
    MI_LOAD_REGISTER_MEM cmd = Family::cmdInitLoadRegisterMem;
    cmd.setRegisterAddress(offset);
    cmd.setMemoryAddress(address);
    remapOffset(&cmd);
    if (isBcs) {
        cmd.setRegisterAddress(offset + RegisterOffsets::bcs0Base);
    }

    auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_MEM>();
    *buffer = cmd;
}

template <typename Family>
void EncodeSetMMIO<Family>::encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset, bool isBcs) {
    MI_LOAD_REGISTER_REG cmd = Family::cmdInitLoadRegisterReg;
    cmd.setSourceRegisterAddress(srcOffset);
    cmd.setDestinationRegisterAddress(dstOffset);
    remapOffset(&cmd);
    if (isBcs) {
        cmd.setSourceRegisterAddress(srcOffset + RegisterOffsets::bcs0Base);
        cmd.setDestinationRegisterAddress(dstOffset + RegisterOffsets::bcs0Base);
    }
    auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_REG>();
    *buffer = cmd;
}

template <typename Family>
void EncodeStoreMMIO<Family>::encode(LinearStream &csr, uint32_t offset, uint64_t address, bool workloadPartition, void **outCmdBuffer, bool isBcs) {
    auto buffer = csr.getSpaceForCmd<MI_STORE_REGISTER_MEM>();
    if (outCmdBuffer != nullptr) {
        *outCmdBuffer = buffer;
    }
    EncodeStoreMMIO<Family>::encode(buffer, offset, address, workloadPartition, isBcs);
}

template <typename Family>
inline void EncodeStoreMMIO<Family>::encode(MI_STORE_REGISTER_MEM *cmdBuffer, uint32_t offset, uint64_t address, bool workloadPartition, bool isBcs) {
    MI_STORE_REGISTER_MEM cmd = Family::cmdInitStoreRegisterMem;
    cmd.setRegisterAddress(offset);
    cmd.setMemoryAddress(address);
    appendFlags(&cmd, workloadPartition);
    if (isBcs) {
        cmd.setRegisterAddress(offset + RegisterOffsets::bcs0Base);
    }
    *cmdBuffer = cmd;
}

template <typename Family>
void EncodeSurfaceState<Family>::encodeBuffer(EncodeSurfaceStateArgs &args) {
    auto surfaceState = reinterpret_cast<R_SURFACE_STATE *>(args.outMemory);
    uint64_t bufferSize = alignUp(args.size, getSurfaceBaseAddressAlignment());
    bufferSize = std::min(bufferSize, static_cast<uint64_t>(MemoryConstants::fullStatefulRegion - 1));

    SurfaceStateBufferLength length = {0};
    length.length = static_cast<uint32_t>(bufferSize - 1);

    surfaceState->setWidth(length.surfaceState.width + 1);
    surfaceState->setHeight(length.surfaceState.height + 1);
    surfaceState->setDepth(length.surfaceState.depth + 1);

    surfaceState->setSurfaceType((args.graphicsAddress != 0) ? R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER
                                                             : R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_NULL);
    surfaceState->setSurfaceFormat(SURFACE_FORMAT::SURFACE_FORMAT_RAW);
    surfaceState->setSurfaceVerticalAlignment(R_SURFACE_STATE::SURFACE_VERTICAL_ALIGNMENT_VALIGN_4);
    surfaceState->setSurfaceHorizontalAlignment(R_SURFACE_STATE::SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_DEFAULT);

    surfaceState->setTileMode(R_SURFACE_STATE::TILE_MODE_LINEAR);
    surfaceState->setVerticalLineStride(0);
    surfaceState->setVerticalLineStrideOffset(0);
    surfaceState->setMemoryObjectControlState(args.mocs);
    surfaceState->setSurfaceBaseAddress(args.graphicsAddress);

    surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE);

    setCoherencyType(surfaceState, args.cpuCoherent ? R_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT
                                                    : R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT);

    auto compressionEnabled = args.allocation ? args.allocation->isCompressionEnabled() : false;
    if (compressionEnabled && !args.forceNonAuxMode) {
        // It's expected not to program pitch/qpitch/baseAddress for Aux surface in CCS scenarios
        setCoherencyType(surfaceState, R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT);
        setBufferAuxParamsForCCS(surfaceState);
    }

    if (debugManager.flags.DisableCachingForStatefulBufferAccess.get()) {
        surfaceState->setMemoryObjectControlState(args.gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED));
    }

    EncodeSurfaceState<Family>::encodeExtraBufferParams(args);

    EncodeSurfaceState<Family>::appendBufferSurfaceState(args);
}
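
/*
 * For raw buffers the surface size is programmed as (size - 1) packed into
 * the WIDTH/HEIGHT/DEPTH fields of RENDER_SURFACE_STATE (each field holds a
 * slice of the linear length and is programmed as value + 1); the
 * SurfaceStateBufferLength union above performs that decomposition.
 */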
template <typename Family>
void EncodeSurfaceState<Family>::getSshAlignedPointer(uintptr_t &ptr, size_t &offset) {
    auto sshAlignmentMask = getSurfaceBaseAddressAlignmentMask();
    uintptr_t alignedPtr = ptr & sshAlignmentMask;

    offset = 0;
    if (ptr != alignedPtr) {
        offset = ptrDiff(ptr, alignedPtr);
        ptr = alignedPtr;
    }
}
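
/*
 * Example: with a 64-byte surface base address alignment, ptr = 0x1040 is
 * aligned down to 0x1000 and offset becomes 0x40; callers program the
 * aligned address and apply the residual offset separately.
 */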
// Returned binding table pointer is relative to given heap (which is assumed to be the surface state base address)
// as required by the INTERFACE_DESCRIPTOR_DATA.
template <typename Family>
size_t EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const void *srcKernelSsh, size_t srcKernelSshSize,
                                                                    size_t numberOfBindingTableStates, size_t offsetOfBindingTable) {
    using BINDING_TABLE_STATE = typename Family::BINDING_TABLE_STATE;
    using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
    using RENDER_SURFACE_STATE = typename Family::RENDER_SURFACE_STATE;

    size_t sshSize = srcKernelSshSize;
    DEBUG_BREAK_IF(srcKernelSsh == nullptr);

    auto srcSurfaceState = srcKernelSsh;
    // Allocate space for new ssh data
    auto dstSurfaceState = dstHeap.getSpace(sshSize);

    // Compiler sends BTI table that is already populated with surface state pointers relative to local SSH.
    // We may need to patch these pointers so that they are relative to surface state base address
    if (dstSurfaceState == dstHeap.getCpuBase()) {
        // nothing to patch, we're at the start of heap (which is assumed to be the surface state base address)
        // we need to simply copy the ssh (including BTIs from compiler)
        memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize);
        return offsetOfBindingTable;
    }

    // We can copy-over the surface states, but BTIs will need to be patched
    memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable);

    uint32_t surfaceStatesOffset = static_cast<uint32_t>(ptrDiff(dstSurfaceState, dstHeap.getCpuBase()));

    // march over BTIs and offset the pointers based on surface state base address
    auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(dstSurfaceState, offsetOfBindingTable));
    DEBUG_BREAK_IF(reinterpret_cast<uintptr_t>(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0);
    auto *srcBtiTableBase = reinterpret_cast<const BINDING_TABLE_STATE *>(ptrOffset(srcSurfaceState, offsetOfBindingTable));
    BINDING_TABLE_STATE bti = Family::cmdInitBindingTableState;
    for (uint32_t i = 0, e = static_cast<uint32_t>(numberOfBindingTableStates); i != e; ++i) {
        uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer();
        uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset;
        bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits
        dstBtiTableBase[i] = bti;
        DEBUG_BREAK_IF(bti.getRawData(0) % sizeof(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE) != 0);
    }

    return ptrDiff(dstBtiTableBase, dstHeap.getCpuBase());
}
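
/*
 * Example of the patching above: if the kernel's surface states land at
 * heap offset S and a compiler-produced BTI entry holds local offset x,
 * the patched entry holds x + S, so it stays valid once the heap base is
 * programmed as surface state base address.
 */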
template <typename Family>
inline void EncodeSurfaceState<Family>::encodeExtraCacheSettings(R_SURFACE_STATE *surfaceState, const EncodeSurfaceStateArgs &args) {}

template <typename Family>
void EncodeSurfaceState<Family>::setImageAuxParamsForCCS(R_SURFACE_STATE *surfaceState, Gmm *gmm) {
    using AUXILIARY_SURFACE_MODE = typename Family::RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE;
    // It's expected not to program pitch/qpitch/baseAddress for Aux surface in CCS scenarios
    surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E);
    setFlagsForMediaCompression(surfaceState, gmm);

    setClearColorParams(surfaceState, gmm);
    setUnifiedAuxBaseAddress(surfaceState, gmm);
}

template <typename Family>
void EncodeSurfaceState<Family>::setBufferAuxParamsForCCS(R_SURFACE_STATE *surfaceState) {
    using AUXILIARY_SURFACE_MODE = typename R_SURFACE_STATE::AUXILIARY_SURFACE_MODE;

    surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E);
}

template <typename Family>
bool EncodeSurfaceState<Family>::isAuxModeEnabled(R_SURFACE_STATE *surfaceState, Gmm *gmm) {
    using AUXILIARY_SURFACE_MODE = typename R_SURFACE_STATE::AUXILIARY_SURFACE_MODE;

    return (surfaceState->getAuxiliarySurfaceMode() == AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E);
}

template <typename Family>
void EncodeSurfaceState<Family>::appendParamsForImageFromBuffer(R_SURFACE_STATE *surfaceState) {
}

template <typename Family>
inline void EncodeDispatchKernel<Family>::encodeCommon(CommandContainer &container, EncodeDispatchKernelArgs &args) {
    using DefaultWalkerType = typename Family::DefaultWalkerType;
    EncodeDispatchKernel<Family>::template encode<DefaultWalkerType>(container, args);
}

template <typename Family>
void *EncodeDispatchKernel<Family>::getInterfaceDescriptor(CommandContainer &container, IndirectHeap *childDsh, uint32_t &iddOffset) {

    if (container.nextIddInBlockRef() == container.getNumIddPerBlock()) {
        void *heapPointer = nullptr;
        size_t heapSize = sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock();
        if (childDsh != nullptr) {
            childDsh->align(NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
            heapPointer = childDsh->getSpace(heapSize);
        } else {
            container.getIndirectHeap(HeapType::dynamicState)->align(NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
            heapPointer = container.getHeapSpaceAllowGrow(HeapType::dynamicState, heapSize);
        }
        container.setIddBlock(heapPointer);
        container.nextIddInBlockRef() = 0;
    }

    iddOffset = container.nextIddInBlockRef();
    auto interfaceDescriptorData = static_cast<INTERFACE_DESCRIPTOR_DATA *>(container.getIddBlock());
    container.nextIddInBlockRef()++;
    return &interfaceDescriptorData[iddOffset];
}

template <typename Family>
bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDescriptor &kernelDesc) {
    auto checkKernelForInlineData = true;
    if (debugManager.flags.EnablePassInlineData.get() != -1) {
        checkKernelForInlineData = !!debugManager.flags.EnablePassInlineData.get();
    }
    if (checkKernelForInlineData) {
        return kernelDesc.kernelAttributes.flags.passInlineData;
    }
    return false;
}

template <typename Family>
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeEuSchedulingPolicy(InterfaceDescriptorType *pInterfaceDescriptor, const KernelDescriptor &kernelDesc, int32_t defaultPipelinedThreadArbitrationPolicy) {
}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}

template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::setWalkerRegionSettings(WalkerType &walkerCmd, const HardwareInfo &hwInfo, uint32_t partitionCount, uint32_t workgroupSize, uint32_t maxWgCountPerTile, bool requiredWalkOrder) {}

template <typename Family>
void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr) {
    const auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
    setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa);
    setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
    UNRECOVERABLE_IF(NEO::isValidOffset(kernelDescriptor.payloadMappings.dispatchTraits.workDim) && (kernelDescriptor.payloadMappings.dispatchTraits.workDim & 0b11) != 0u);
    setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
    if (implicitArgsGpuPtr) {
        CrossThreadDataOffset groupCountOffset[] = {offsetof(ImplicitArgs, groupCountX), offsetof(ImplicitArgs, groupCountY), offsetof(ImplicitArgs, groupCountZ)};
        CrossThreadDataOffset globalSizeOffset[] = {offsetof(ImplicitArgs, globalSizeX), offsetof(ImplicitArgs, globalSizeY), offsetof(ImplicitArgs, globalSizeZ)};
        setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr);
        setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
        setWorkDimIndirect(container, offsetof(ImplicitArgs, numWorkDim), implicitArgsGpuPtr, dispatchInterface->getGroupSize());
    }
}

template <typename Family>
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress) {
    for (int i = 0; i < 3; ++i) {
        if (NEO::isUndefinedOffset(offsets[i])) {
            continue;
        }
        EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RegisterOffsets::gpgpuDispatchDim[i], ptrOffset(crossThreadAddress, offsets[i]), false, nullptr, false);
    }
}
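
/*
 * setWorkDimIndirect() computes the work dimension (1..3) on the GPU at
 * dispatch time. Because workDim occupies a single byte that may not be
 * dword-aligned, the code builds a byte mask from (dstPtr & 0b11), backs up
 * the surrounding bytes, shifts the computed value into the right byte
 * lane, and adds the backup back into the final store. For example, with
 * dstPtr & 0b11 == 1, "offset" is 0xFF and "memoryMask" is 0xFFFF00FF, so
 * only byte 1 of the destination dword is rewritten.
 */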
template <typename Family>
void EncodeIndirectParams<Family>::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, uint64_t crossThreadAddress, const uint32_t *groupSize) {
    if (NEO::isValidOffset(workDimOffset)) {
        auto dstPtr = ptrOffset(crossThreadAddress, workDimOffset);
        constexpr uint32_t resultRegister = RegisterOffsets::csGprR0;
        constexpr AluRegisters resultAluRegister = AluRegisters::gpr0;
        const uint32_t offset = static_cast<uint32_t>((1ull << 8 * (dstPtr & 0b11)) - 1);
        const uint32_t memoryMask = std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>((1ull << 8 * ((dstPtr & 0b11) + 1)) - 1) + offset;

        /*
         * if ( groupSize[2] > 1 || groupCount[2] > 1 ) { workdim = 3 }
         * else if ( groupSize[1] + groupCount[1] > 2 ) { workdim = 2 }
         * else { workdim = 1 }
         */

        if (groupSize[2] > 1) {
            EncodeSetMMIO<Family>::encodeIMM(container, resultRegister, 3 << (8 * (dstPtr & 0b11)), true, false);
        } else {
            constexpr uint32_t groupCount2Register = RegisterOffsets::csGprR1;
            constexpr AluRegisters groupCount2AluRegister = AluRegisters::gpr1;
            constexpr uint32_t groupSize1Register = RegisterOffsets::csGprR0;
            constexpr AluRegisters groupSize1AluRegister = AluRegisters::gpr0;
            constexpr uint32_t groupCount1Register = RegisterOffsets::csGprR1;
            constexpr AluRegisters groupCount1AluRegister = AluRegisters::gpr1;
            constexpr AluRegisters sumAluRegister = AluRegisters::gpr0;
            constexpr AluRegisters workDimEq3AluRegister = AluRegisters::gpr3;
            constexpr AluRegisters workDimGe2AluRegister = AluRegisters::gpr4;
            constexpr uint32_t constantOneRegister = RegisterOffsets::csGprR5;
            constexpr AluRegisters constantOneAluRegister = AluRegisters::gpr5;
            constexpr uint32_t constantTwoRegister = RegisterOffsets::csGprR6;
            constexpr AluRegisters constantTwoAluRegister = AluRegisters::gpr6;
            constexpr uint32_t backupRegister = RegisterOffsets::csGprR7;
            constexpr AluRegisters backupAluRegister = AluRegisters::gpr7;
            constexpr uint32_t memoryMaskRegister = RegisterOffsets::csGprR8;
            constexpr AluRegisters memoryMaskAluRegister = AluRegisters::gpr8;
            constexpr uint32_t offsetRegister = RegisterOffsets::csGprR8;
            constexpr AluRegisters offsetAluRegister = AluRegisters::gpr8;

            if (offset) {
                EncodeSetMMIO<Family>::encodeMEM(container, backupRegister, dstPtr, false);
                EncodeSetMMIO<Family>::encodeIMM(container, memoryMaskRegister, memoryMask, true, false);
                EncodeMath<Family>::bitwiseAnd(container, memoryMaskAluRegister, backupAluRegister, backupAluRegister);
                EncodeSetMMIO<Family>::encodeIMM(container, offsetRegister, offset, true, false);
            }

            EncodeSetMMIO<Family>::encodeIMM(container, constantOneRegister, 1, true, false);
            EncodeSetMMIO<Family>::encodeIMM(container, constantTwoRegister, 2, true, false);

            EncodeSetMMIO<Family>::encodeREG(container, groupCount2Register, RegisterOffsets::gpgpuDispatchDim[2], false);

            EncodeMath<Family>::greaterThan(container, groupCount2AluRegister, constantOneAluRegister, workDimEq3AluRegister);
            EncodeMath<Family>::bitwiseAnd(container, workDimEq3AluRegister, constantOneAluRegister, workDimEq3AluRegister);

            EncodeSetMMIO<Family>::encodeIMM(container, groupSize1Register, groupSize[1], true, false);
            EncodeSetMMIO<Family>::encodeREG(container, groupCount1Register, RegisterOffsets::gpgpuDispatchDim[1], false);

            EncodeMath<Family>::addition(container, groupSize1AluRegister, groupCount1AluRegister, sumAluRegister);
            EncodeMath<Family>::addition(container, sumAluRegister, workDimEq3AluRegister, sumAluRegister);
            EncodeMath<Family>::greaterThan(container, sumAluRegister, constantTwoAluRegister, workDimGe2AluRegister);
            EncodeMath<Family>::bitwiseAnd(container, workDimGe2AluRegister, constantOneAluRegister, workDimGe2AluRegister);

            if (offset) {
                EncodeMath<Family>::addition(container, constantOneAluRegister, offsetAluRegister, constantOneAluRegister);
                EncodeMath<Family>::addition(container, workDimEq3AluRegister, offsetAluRegister, workDimEq3AluRegister);
                EncodeMath<Family>::bitwiseAnd(container, workDimEq3AluRegister, constantOneAluRegister, workDimEq3AluRegister);
                EncodeMath<Family>::addition(container, workDimGe2AluRegister, offsetAluRegister, workDimGe2AluRegister);
                EncodeMath<Family>::bitwiseAnd(container, workDimGe2AluRegister, constantOneAluRegister, workDimGe2AluRegister);
            }

            EncodeSetMMIO<Family>::encodeREG(container, resultRegister, constantOneRegister, false);
            EncodeMath<Family>::addition(container, resultAluRegister, workDimGe2AluRegister, resultAluRegister);
            EncodeMath<Family>::addition(container, resultAluRegister, workDimEq3AluRegister, resultAluRegister);

            if (offset) {
                EncodeMath<Family>::addition(container, resultAluRegister, backupAluRegister, resultAluRegister);
            }
        }
        EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), resultRegister, dstPtr, false, nullptr, false);
    }
}
template <typename Family>
bool EncodeSurfaceState<Family>::doBindingTablePrefetch() {
    auto enableBindingTablePrefetech = isBindingTablePrefetchPreferred();
    if (debugManager.flags.ForceBtpPrefetchMode.get() != -1) {
        enableBindingTablePrefetech = static_cast<bool>(debugManager.flags.ForceBtpPrefetchMode.get());
    }
    return enableBindingTablePrefetech;
}

template <typename Family>
void EncodeSurfaceState<Family>::setPitchForScratch(R_SURFACE_STATE *surfaceState, uint32_t pitch, const ProductHelper &productHelper) {
    surfaceState->setSurfacePitch(pitch);
}

template <typename Family>
uint32_t EncodeSurfaceState<Family>::getPitchForScratchInBytes(R_SURFACE_STATE *surfaceState, const ProductHelper &productHelper) {
    return surfaceState->getSurfacePitch();
}

template <typename Family>
void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount) {
    auto enablePrefetch = EncodeSurfaceState<Family>::doBindingTablePrefetch();

    if (enablePrefetch) {
        interfaceDescriptor.setSamplerCount(static_cast<typename INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT>((samplerCount + 3) / 4));
        interfaceDescriptor.setBindingTableEntryCount(std::min(bindingTableEntryCount, 31u));
    } else {
        interfaceDescriptor.setSamplerCount(INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT::SAMPLER_COUNT_NO_SAMPLERS_USED);
        interfaceDescriptor.setBindingTableEntryCount(0u);
    }
}

template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {}

template <typename Family>
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
    using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
    constexpr auto samplerStateSize = sizeof(typename Family::SAMPLER_STATE);
    const auto numSamplers = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
    const auto additionalDshSize = additionalSizeRequiredDsh(iddCount);
    if (numSamplers == 0U) {
        return alignUp(additionalDshSize, EncodeDispatchKernel<Family>::getDefaultDshAlignment());
    }

    size_t size = kernelDescriptor.payloadMappings.samplerTable.tableOffset -
                  kernelDescriptor.payloadMappings.samplerTable.borderColor;
    size = alignUp(size, EncodeDispatchKernel<Family>::getDefaultDshAlignment());

    size += numSamplers * samplerStateSize;
    size = alignUp(size, INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);

    if (additionalDshSize > 0) {
        size = alignUp(size, EncodeStates<Family>::alignInterfaceDescriptorData);
        size += additionalDshSize;
        size = alignUp(size, EncodeDispatchKernel<Family>::getDefaultDshAlignment());
    }

    return size;
}
template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
    const auto &productHelper = device.getProductHelper();
    if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
        interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);

        bool adjustTGDispatchSize = true;
        if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
            adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
        }
        // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
        auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
        if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
            algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
        }

        if (algorithmVersion == 2) {
            auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
            if (grfCount == 256) {
                threadsPerXeCore /= 2;
            }
            auto tgDispatchSizeSelected = 8;

            uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();

            if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
                while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
                    tgDispatchSizeSelected /= 2;
                }
            } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
                while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
                    tgDispatchSizeSelected /= 2;
                }
            }

            auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
            auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
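
            /*
             * Heuristic: start from the maximum dispatch size (8), shrink it
             * until it divides the walker's X (or Y) dimension, then keep
             * halving while the selection would leave Xe cores idle across
             * all tiles or exceed the per-core thread budget computed above.
             */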
            // make sure we occupy all Xe cores
            while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
                tgDispatchSizeSelected /= 2;
            }

            auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
            // make sure we do not use more threads than present on each Xe core
            while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
                tgDispatchSizeSelected /= 2;
                threadCountPerGrouping /= 2;
            }

            if (tgDispatchSizeSelected == 8) {
                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
            } else if (tgDispatchSizeSelected == 1) {
                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
            } else if (tgDispatchSizeSelected == 2) {
                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
            } else {
                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
            }
        } else {
            if (adjustTGDispatchSize) {
                UNRECOVERABLE_IF(grfCount == 0u);
                constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
                constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
                auto &gfxCoreHelper = device.getGfxCoreHelper();
                uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount);
                if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
                    const uint32_t tilesCount = device.getNumSubDevices();
                    availableThreadCount *= tilesCount;
                }
                uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
                uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
                UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
                auto tgDispatchSizeSelected = 1u;

                if (dispatchedTotalThreadCount <= availableThreadCount) {
                    tgDispatchSizeSelected = 1;
                } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
                    tgDispatchSizeSelected = 8;
                } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
                    tgDispatchSizeSelected = 4;
                } else {
                    tgDispatchSizeSelected = 2;
                }

                if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
                    while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
                        tgDispatchSizeSelected /= 2;
                    }
                } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
                    while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
                        tgDispatchSizeSelected /= 2;
                    }
                }

                if (tgDispatchSizeSelected == 8) {
                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
                } else if (tgDispatchSizeSelected == 1) {
                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
                } else if (tgDispatchSizeSelected == 2) {
                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
                } else {
                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
                }
            }
        }
    }

    if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
        interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE>(
            debugManager.flags.ForceThreadGroupDispatchSize.get()));
    }
}
template <typename Family>
size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
    size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize;
    bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelInfo.kernelDescriptor);
    if (isBindlessKernel) {
        requiredSshSize = std::max(requiredSshSize, kernelInfo.kernelDescriptor.kernelAttributes.numArgsStateful * sizeof(typename Family::RENDER_SURFACE_STATE));
    }
    requiredSshSize = alignUp(requiredSshSize, EncodeDispatchKernel<Family>::getDefaultSshAlignment());
    return requiredSshSize;
}

template <typename Family>
size_t EncodeDispatchKernel<Family>::getDefaultDshAlignment() {
    return Family::cacheLineSize;
}

template <typename Family>
size_t EncodeDispatchKernel<Family>::getScratchPtrOffsetOfImplicitArgs() {
    return 0;
}

template <typename Family>
template <bool heaplessModeEnabled>
void EncodeDispatchKernel<Family>::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &submissionCsr) {
}

template <typename Family>
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) {
    for (int i = 0; i < 3; ++i) {
        if (NEO::isUndefinedOffset(offsets[i])) {
            continue;
        }
        EncodeMathMMIO<Family>::encodeMulRegVal(container, RegisterOffsets::gpgpuDispatchDim[i], lws[i], ptrOffset(crossThreadAddress, offsets[i]), false);
    }
}

template <typename Family>
inline size_t EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(const uint32_t *groupSize, bool misaligedPtr) {
    constexpr uint32_t aluCmdSize = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * RegisterConstants::numAluInstForReadModifyWrite;
    auto requiredSize = sizeof(MI_STORE_REGISTER_MEM) + sizeof(MI_LOAD_REGISTER_IMM);
    UNRECOVERABLE_IF(!groupSize);
    if (groupSize[2] < 2) {
        requiredSize += 2 * sizeof(MI_LOAD_REGISTER_IMM) + 3 * sizeof(MI_LOAD_REGISTER_REG) + 8 * aluCmdSize;
        if (misaligedPtr) {
            requiredSize += 2 * sizeof(MI_LOAD_REGISTER_IMM) + sizeof(MI_LOAD_REGISTER_MEM) + 7 * aluCmdSize;
        }
    }
    return requiredSize;
}

template <typename Family>
void EncodeSemaphore<Family>::appendSemaphoreCommand(MI_SEMAPHORE_WAIT &cmd, uint64_t compareData, bool indirect, bool useQwordData, bool switchOnUnsuccessful) {
    constexpr uint64_t upper32b = static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) << 32;
    UNRECOVERABLE_IF(useQwordData || (compareData & upper32b));
}

template <typename Family>
void EncodeSemaphore<Family>::addMiSemaphoreWaitCommand(LinearStream &commandStream,
                                                        uint64_t compareAddress,
                                                        uint64_t compareData,
                                                        COMPARE_OPERATION compareMode,
                                                        bool registerPollMode,
                                                        bool useQwordData,
                                                        bool indirect,
                                                        bool switchOnUnsuccessful,
                                                        void **outSemWaitCmd) {
    auto semaphoreCommand = commandStream.getSpaceForCmd<MI_SEMAPHORE_WAIT>();
    if (outSemWaitCmd != nullptr) {
        *outSemWaitCmd = semaphoreCommand;
    }
    programMiSemaphoreWait(semaphoreCommand,
                           compareAddress,
                           compareData,
                           compareMode,
                           registerPollMode,
                           true,
                           useQwordData,
                           indirect,
                           switchOnUnsuccessful);
}

template <typename Family>
void EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(LinearStream &commandStream, std::list<void *> &commandsList) {
    MI_SEMAPHORE_WAIT *semaphoreCommand = commandStream.getSpaceForCmd<MI_SEMAPHORE_WAIT>();
    commandsList.push_back(semaphoreCommand);
}

template <typename Family>
inline void EncodeAtomic<Family>::setMiAtomicAddress(MI_ATOMIC &atomic, uint64_t writeAddress) {
    atomic.setMemoryAddress(static_cast<uint32_t>(writeAddress & 0x0000FFFFFFFFULL));
    atomic.setMemoryAddressHigh(static_cast<uint32_t>(writeAddress >> 32));
}
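
/*
 * MI_ATOMIC splits the 64-bit destination into MemoryAddress (low dword)
 * and MemoryAddressHigh. For the MOVE / CMP_WR / 8B_ADD opcodes handled
 * below, the operands come from the command itself, so it is extended with
 * inline data dwords carrying operand1 and operand2.
 */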
template <typename Family>
void EncodeAtomic<Family>::programMiAtomic(MI_ATOMIC *atomic,
                                           uint64_t writeAddress,
                                           ATOMIC_OPCODES opcode,
                                           DATA_SIZE dataSize,
                                           uint32_t returnDataControl,
                                           uint32_t csStall,
                                           uint64_t operand1Data,
                                           uint64_t operand2Data) {
    MI_ATOMIC cmd = Family::cmdInitAtomic;
    cmd.setAtomicOpcode(opcode);
    cmd.setDataSize(dataSize);
    EncodeAtomic<Family>::setMiAtomicAddress(cmd, writeAddress);
    cmd.setReturnDataControl(returnDataControl);
    cmd.setCsStall(csStall);
    if (opcode == ATOMIC_OPCODES::ATOMIC_4B_MOVE ||
        opcode == ATOMIC_OPCODES::ATOMIC_8B_MOVE ||
        opcode == ATOMIC_OPCODES::ATOMIC_8B_CMP_WR ||
        opcode == ATOMIC_OPCODES::ATOMIC_8B_ADD) {
        cmd.setDwordLength(MI_ATOMIC::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1);
        cmd.setInlineData(0x1);
        cmd.setOperand1DataDword0(getLowPart(operand1Data));
        cmd.setOperand1DataDword1(getHighPart(operand1Data));
        cmd.setOperand2DataDword0(getLowPart(operand2Data));
        cmd.setOperand2DataDword1(getHighPart(operand2Data));
    }

    *atomic = cmd;
}

template <typename Family>
void EncodeAtomic<Family>::programMiAtomic(LinearStream &commandStream,
                                           uint64_t writeAddress,
                                           ATOMIC_OPCODES opcode,
                                           DATA_SIZE dataSize,
                                           uint32_t returnDataControl,
                                           uint32_t csStall,
                                           uint64_t operand1Data,
                                           uint64_t operand2Data) {
    auto miAtomic = commandStream.getSpaceForCmd<MI_ATOMIC>();
    EncodeAtomic<Family>::programMiAtomic(miAtomic, writeAddress, opcode, dataSize, returnDataControl, csStall, operand1Data, operand2Data);
}
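
/*
 * The conditional batch-buffer-start helpers below share one pattern:
 * load the left-hand operand into gpr7 and the right-hand operand into
 * gpr8 (zero-extending the upper dword when only 32 bits are compared),
 * subtract them in the ALU, latch ZF or CF into MI_PREDICATE_RESULT_2,
 * and emit a predicated MI_BATCH_BUFFER_START that is NOOPed when the
 * condition fails.
 */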
template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programConditionalDataMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs) {
    EncodeSetMMIO<Family>::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress, isBcs);
    if (useQwordData) {
        EncodeSetMMIO<Family>::encodeMEM(commandStream, RegisterOffsets::csGprR7 + 4, compareAddress + 4, isBcs);
    } else {
        LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);
    }

    uint32_t compareDataLow = static_cast<uint32_t>(compareData & std::numeric_limits<uint32_t>::max());
    uint32_t compareDataHigh = useQwordData ? static_cast<uint32_t>(compareData >> 32) : 0;

    LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true, isBcs);
    LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true, isBcs);

    programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect, isBcs);
}

template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programConditionalDataRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint32_t compareReg, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs) {
    EncodeSetMMIO<Family>::encodeREG(commandStream, RegisterOffsets::csGprR7, compareReg, isBcs);
    if (useQwordData) {
        EncodeSetMMIO<Family>::encodeREG(commandStream, RegisterOffsets::csGprR7 + 4, compareReg + 4, isBcs);
    } else {
        LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);
    }

    uint32_t compareDataLow = static_cast<uint32_t>(compareData & std::numeric_limits<uint32_t>::max());
    uint32_t compareDataHigh = useQwordData ? static_cast<uint32_t>(compareData >> 32) : 0;

    LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true, isBcs);
    LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true, isBcs);

    programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect, isBcs);
}

template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programConditionalRegRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, AluRegisters compareReg0, AluRegisters compareReg1, CompareOperation compareOperation, bool indirect, bool isBcs) {
    programConditionalBatchBufferStartBase(commandStream, startAddress, compareReg0, compareReg1, compareOperation, indirect, isBcs);
}

template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programConditionalRegMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint32_t compareReg, CompareOperation compareOperation, bool indirect, bool isBcs) {
    EncodeSetMMIO<Family>::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress, isBcs);
    LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);

    EncodeSetMMIO<Family>::encodeREG(commandStream, RegisterOffsets::csGprR8, compareReg, isBcs);
    LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8 + 4, 0, true, isBcs);

    programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect, isBcs);
}

template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programConditionalBatchBufferStartBase(LinearStream &commandStream, uint64_t startAddress, AluRegisters regA, AluRegisters regB, CompareOperation compareOperation, bool indirect, bool isBcs) {
    EncodeAluHelper<Family, 4> aluHelper;
    aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srca, regA);
    aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srcb, regB);
    aluHelper.setNextAlu(AluRegisters::opcodeSub);

    if ((compareOperation == CompareOperation::equal) || (compareOperation == CompareOperation::notEqual)) {
        aluHelper.setNextAlu(AluRegisters::opcodeStore, AluRegisters::gpr7, AluRegisters::zf);
    } else if ((compareOperation == CompareOperation::greaterOrEqual) || (compareOperation == CompareOperation::less)) {
        aluHelper.setNextAlu(AluRegisters::opcodeStore, AluRegisters::gpr7, AluRegisters::cf);
    } else {
        UNRECOVERABLE_IF(true);
    }

    aluHelper.copyToCmdStream(commandStream);

    EncodeSetMMIO<Family>::encodeREG(commandStream, RegisterOffsets::csPredicateResult2, RegisterOffsets::csGprR7, isBcs);

    MiPredicateType predicateType = MiPredicateType::noopOnResult2Clear; // Equal or Less
    if ((compareOperation == CompareOperation::notEqual) || (compareOperation == CompareOperation::greaterOrEqual)) {
        predicateType = MiPredicateType::noopOnResult2Set;
    }

    EncodeMiPredicate<Family>::encode(commandStream, predicateType);

    programBatchBufferStart(&commandStream, startAddress, false, indirect, true);
    EncodeMiPredicate<Family>::encode(commandStream, MiPredicateType::disable);
}

template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(MI_BATCH_BUFFER_START *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
    MI_BATCH_BUFFER_START cmd = Family::cmdInitBatchBufferStart;
    if (secondLevel) {
        cmd.setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH);
    }
    cmd.setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT);
    cmd.setBatchBufferStartAddress(address);
    appendBatchBufferStart(cmd, indirect, predicate);
    *cmdBuffer = cmd;
}
template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
    programBatchBufferStart(commandStream->getSpaceForCmd<MI_BATCH_BUFFER_START>(), address, secondLevel, indirect, predicate);
}

template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferEnd(LinearStream &commandStream) {
    MI_BATCH_BUFFER_END cmd = Family::cmdInitBatchBufferEnd;
    auto buffer = commandStream.getSpaceForCmd<MI_BATCH_BUFFER_END>();
    *buffer = cmd;
}

template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferEnd(CommandContainer &container) {
    programBatchBufferEnd(*container.getCommandStream());
}

template <typename Family>
void EncodeMiFlushDW<Family>::appendWa(LinearStream &commandStream, MiFlushArgs &args) {
    BlitCommandsHelper<Family>::dispatchDummyBlit(commandStream, args.waArgs);
}

template <typename Family>
void EncodeMiFlushDW<Family>::programWithWa(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, MiFlushArgs &args) {
    UNRECOVERABLE_IF(args.waArgs.isWaRequired && !args.commandWithPostSync);
    appendWa(commandStream, args);
    args.waArgs.isWaRequired = false;

    auto miFlushDwCmd = commandStream.getSpaceForCmd<MI_FLUSH_DW>();
    MI_FLUSH_DW miFlush = Family::cmdInitMiFlushDw;
    if (args.commandWithPostSync) {
        auto postSyncType = args.timeStampOperation ? MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_TIMESTAMP_REGISTER
                                                    : MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD;
        miFlush.setPostSyncOperation(postSyncType);
        miFlush.setDestinationAddress(immediateDataGpuAddress);
        miFlush.setImmediateData(immediateData);
    }
    miFlush.setNotifyEnable(args.notifyEnable);
    miFlush.setTlbInvalidate(args.tlbFlush);
    adjust(&miFlush, args.waArgs.rootDeviceEnvironment->getProductHelper());
    *miFlushDwCmd = miFlush;
}

template <typename Family>
size_t EncodeMiFlushDW<Family>::getWaSize(const EncodeDummyBlitWaArgs &waArgs) {
    return BlitCommandsHelper<Family>::getDummyBlitSize(waArgs);
}

template <typename Family>
size_t EncodeMiFlushDW<Family>::getCommandSizeWithWa(const EncodeDummyBlitWaArgs &waArgs) {
    return sizeof(typename Family::MI_FLUSH_DW) + EncodeMiFlushDW<Family>::getWaSize(waArgs);
}

template <typename Family>
inline void EncodeMemoryPrefetch<Family>::programMemoryPrefetch(LinearStream &commandStream, const GraphicsAllocation &graphicsAllocation, uint32_t size, size_t offset, const RootDeviceEnvironment &rootDeviceEnvironment) {}

template <typename Family>
inline size_t EncodeMemoryPrefetch<Family>::getSizeForMemoryPrefetch(size_t size, const RootDeviceEnvironment &rootDeviceEnvironment) { return 0u; }

template <typename Family>
void EncodeMiArbCheck<Family>::program(LinearStream &commandStream, std::optional<bool> preParserDisable) {
    MI_ARB_CHECK cmd = Family::cmdInitArbCheck;

    EncodeMiArbCheck<Family>::adjust(cmd, preParserDisable);
    auto miArbCheckStream = commandStream.getSpaceForCmd<MI_ARB_CHECK>();
    *miArbCheckStream = cmd;
}

template <typename Family>
size_t EncodeMiArbCheck<Family>::getCommandSize() { return sizeof(MI_ARB_CHECK); }

template <typename Family>
inline void EncodeNoop<Family>::alignToCacheLine(LinearStream &commandStream) {
    auto used = commandStream.getUsed();
    auto alignment = MemoryConstants::cacheLineSize;
    auto partialCacheline = used & (alignment - 1);
    if (partialCacheline) {
        auto amountToPad = alignment - partialCacheline;
        auto pCmd = commandStream.getSpace(amountToPad);
        memset(pCmd, 0, amountToPad);
    }
}

template <typename Family>
inline void EncodeNoop<Family>::emitNoop(LinearStream &commandStream, size_t bytesToUpdate) {
    if (bytesToUpdate) {
        auto ptr = commandStream.getSpace(bytesToUpdate);
        memset(ptr, 0, bytesToUpdate);
    }
}
template <typename Family>
inline void EncodeStoreMemory<Family>::programStoreDataImm(LinearStream &commandStream,
                                                           uint64_t gpuAddress,
                                                           uint32_t dataDword0,
                                                           uint32_t dataDword1,
                                                           bool storeQword,
                                                           bool workloadPartitionOffset,
                                                           void **outCmdPtr) {
    auto miStoreDataImmBuffer = commandStream.getSpaceForCmd<MI_STORE_DATA_IMM>();
    if (outCmdPtr != nullptr) {
        *outCmdPtr = miStoreDataImmBuffer;
    }
    EncodeStoreMemory<Family>::programStoreDataImm(miStoreDataImmBuffer, gpuAddress, dataDword0, dataDword1, storeQword, workloadPartitionOffset);
}

template <typename Family>
void EncodeEnableRayTracing<Family>::append3dStateBtd(void *ptr3dStateBtd) {}

template <typename Family>
inline void EncodeWA<Family>::setAdditionalPipeControlFlagsForNonPipelineStateCommand(PipeControlArgs &args) {}

template <typename Family>
size_t EncodeMemoryFence<Family>::getSystemMemoryFenceSize() {
    return 0;
}

template <typename Family>
void EncodeMemoryFence<Family>::encodeSystemMemoryFence(LinearStream &commandStream, const GraphicsAllocation *globalFenceAllocation) {
}

template <typename Family>
void EncodeMiPredicate<Family>::encode(LinearStream &cmdStream, [[maybe_unused]] MiPredicateType predicateType) {
    if constexpr (Family::isUsingMiSetPredicate) {
        using MI_SET_PREDICATE = typename Family::MI_SET_PREDICATE;
        using PREDICATE_ENABLE = typename MI_SET_PREDICATE::PREDICATE_ENABLE;

        auto miSetPredicate = Family::cmdInitSetPredicate;
        miSetPredicate.setPredicateEnable(static_cast<PREDICATE_ENABLE>(predicateType));

        *cmdStream.getSpaceForCmd<MI_SET_PREDICATE>() = miSetPredicate;
    }
}

template <typename Family>
void EnodeUserInterrupt<Family>::encode(LinearStream &commandStream) {
    *commandStream.getSpaceForCmd<typename Family::MI_USER_INTERRUPT>() = Family::cmdInitUserInterrupt;
}

} // namespace NEO