/* * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "shared/source/command_container/command_encoder.h" #include "shared/source/command_stream/linear_stream.h" #include "shared/source/device/device.h" #include "shared/source/execution_environment/execution_environment.h" #include "shared/source/gmm_helper/gmm.h" #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/helpers/api_specific_config.h" #include "shared/source/helpers/bindless_heaps_helper.h" #include "shared/source/helpers/hw_helper.h" #include "shared/source/helpers/local_id_gen.h" #include "shared/source/helpers/preamble.h" #include "shared/source/helpers/register_offsets.h" #include "shared/source/helpers/simd_helper.h" #include "shared/source/helpers/string.h" #include "shared/source/image/image_surface_state.h" #include "shared/source/kernel/dispatch_kernel_encoder_interface.h" #include "shared/source/kernel/kernel_descriptor.h" #include namespace NEO { /* NOTE(review): template parameter lists and cast target types (the text normally written inside angle brackets) look like they were lost from this file as shown - confirm against the original before building. */ /* copySamplerState: copies samplerCount SAMPLER_STATE entries from fnDynamicStateHeap + samplerStateOffset into the destination heap, sets each copy's indirect state pointer to the chosen border color offset, and returns the offset of the first copied entry. Without bindless configuration the (samplerStateOffset - borderColorOffset) border color bytes are copied into dsh ahead of the sampler states. With bindless configuration only RGB == 0 with alpha 0 or 1 is accepted (anything else trips UNRECOVERABLE_IF) and both the border color offset and the sampler state storage come from bindlessHeapHelper. */ template uint32_t EncodeStates::copySamplerState(IndirectHeap *dsh, uint32_t samplerStateOffset, uint32_t samplerCount, uint32_t borderColorOffset, const void *fnDynamicStateHeap, BindlessHeapsHelper *bindlessHeapHelper) { auto sizeSamplerState = sizeof(SAMPLER_STATE) * samplerCount; auto borderColorSize = samplerStateOffset - borderColorOffset; SAMPLER_STATE *dstSamplerState = nullptr; uint32_t samplerStateOffsetInDsh = 0; dsh->align(EncodeStates::alignIndirectStatePointer); uint32_t borderColorOffsetInDsh = 0; if (!ApiSpecificConfig::getBindlessConfiguration()) { borderColorOffsetInDsh = static_cast(dsh->getUsed()); auto borderColor = dsh->getSpace(borderColorSize); memcpy_s(borderColor, borderColorSize, ptrOffset(fnDynamicStateHeap, borderColorOffset), borderColorSize); dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE); samplerStateOffsetInDsh = static_cast(dsh->getUsed()); dstSamplerState = reinterpret_cast(dsh->getSpace(sizeSamplerState)); } else { auto 
borderColor = reinterpret_cast(ptrOffset(fnDynamicStateHeap, borderColorOffset)); if (borderColor->getBorderColorRed() != 0.0f || borderColor->getBorderColorGreen() != 0.0f || borderColor->getBorderColorBlue() != 0.0f || (borderColor->getBorderColorAlpha() != 0.0f && borderColor->getBorderColorAlpha() != 1.0f)) { UNRECOVERABLE_IF(true); } else if (borderColor->getBorderColorAlpha() == 0.0f) { borderColorOffsetInDsh = bindlessHeapHelper->getDefaultBorderColorOffset(); } else { borderColorOffsetInDsh = bindlessHeapHelper->getAlphaBorderColorOffset(); } dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE); auto samplerStateInDsh = bindlessHeapHelper->allocateSSInHeap(sizeSamplerState, nullptr, BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH); dstSamplerState = reinterpret_cast(samplerStateInDsh.ssPtr); samplerStateOffsetInDsh = static_cast(samplerStateInDsh.surfaceStateOffset); } auto srcSamplerState = reinterpret_cast(ptrOffset(fnDynamicStateHeap, samplerStateOffset)); SAMPLER_STATE state = {}; for (uint32_t i = 0; i < samplerCount; i++) { state = srcSamplerState[i]; state.setIndirectStatePointer(static_cast(borderColorOffsetInDsh)); dstSamplerState[i] = state; } return samplerStateOffsetInDsh; } // namespace NEO /* getAdjustStateComputeModeSize: always returns 0 here. */ template size_t EncodeStates::getAdjustStateComputeModeSize() { return 0; } /* encodeMulRegVal: multiplies the register at offset by val using shift-and-add and stores the product to dstAddress. R0 starts as the source register and is doubled on every iteration; R1 starts at 0 and accumulates R0 for each set bit of val. NOTE(review): when bit 31 of val is set, logLws reaches 32 and val >> logLws (and 1 << i at i == 31) shifts out of range - confirm callers never pass such values. The initial int i = val is overwritten by i = 0 before it is read. */ template void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress) { int logLws = 0; int i = val; while (val >> logLws) { logLws++; } EncodeSetMMIO::encodeREG(container, CS_GPR_R0, offset); EncodeSetMMIO::encodeIMM(container, CS_GPR_R1, 0, true); i = 0; while (i < logLws) { if (val & (1 << i)) { EncodeMath::addition(container, AluRegisters::R_1, AluRegisters::R_0, AluRegisters::R_2); EncodeSetMMIO::encodeREG(container, CS_GPR_R1, CS_GPR_R2); } EncodeMath::addition(container, AluRegisters::R_0, AluRegisters::R_0, AluRegisters::R_2); EncodeSetMMIO::encodeREG(container, CS_GPR_R0, CS_GPR_R2); i++; } 
EncodeStoreMMIO::encode(*container.getCommandStream(), CS_GPR_R1, dstAddress); } /* * Compute *firstOperand > secondOperand and store the result in * MI_PREDICATE_RESULT where firstOperand is a device memory address. * * To calculate the "greater than" operation in the device, * (secondOperand - *firstOperand) is used, and if the carry flag register is * set, then (*firstOperand) is greater than secondOperand. */ template void EncodeMathMMIO::encodeGreaterThanPredicate(CommandContainer &container, uint64_t firstOperand, uint32_t secondOperand) { EncodeSetMMIO::encodeMEM(container, CS_GPR_R0, firstOperand); EncodeSetMMIO::encodeIMM(container, CS_GPR_R1, secondOperand, true); /* CS_GPR_R* registers map to AluRegisters::R_* registers */ EncodeMath::greaterThan(container, AluRegisters::R_0, AluRegisters::R_1, AluRegisters::R_2); EncodeSetMMIO::encodeREG(container, CS_PREDICATE_RESULT, CS_GPR_R2); } /* * Compute bitwise AND between a register value from regOffset and immVal * and store it into dstAddress. */ template void EncodeMathMMIO::encodeBitwiseAndVal(CommandContainer &container, uint32_t regOffset, uint32_t immVal, uint64_t dstAddress) { EncodeSetMMIO::encodeREG(container, CS_GPR_R0, regOffset); EncodeSetMMIO::encodeIMM(container, CS_GPR_R1, immVal, true); EncodeMath::bitwiseAnd(container, AluRegisters::R_0, AluRegisters::R_1, AluRegisters::R_2); EncodeStoreMMIO::encode(*container.getCommandStream(), CS_GPR_R2, dstAddress); } /* * encodeAlu() performs operations that leave a state including the result of * an operation such as the carry flag, and the accu flag with subtraction and * addition result. * * Parameter "postOperationStateRegister" is the ALU register with the result * from the operation that the function caller is interested in obtaining. * * Parameter "finalResultRegister" is the final destination register where * data from "postOperationStateRegister" will be copied. 
*/ /* Writes four consecutive ALU instructions starting at pAluParam: LOAD R_SRCA from srcA, LOAD R_SRCB from srcB, the op opcode with both operand fields zero, and STORE of postOperationStateRegister into finalResultRegister. pAluParam is a by-value pointer, so the caller's pointer is not advanced. */ template void EncodeMathMMIO::encodeAlu(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters srcA, AluRegisters srcB, AluRegisters op, AluRegisters finalResultRegister, AluRegisters postOperationStateRegister) { MI_MATH_ALU_INST_INLINE aluParam; aluParam.DW0.Value = 0x0; aluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); aluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); aluParam.DW0.BitField.Operand2 = static_cast(srcA); *pAluParam = aluParam; pAluParam++; aluParam.DW0.Value = 0x0; aluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); aluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); aluParam.DW0.BitField.Operand2 = static_cast(srcB); *pAluParam = aluParam; pAluParam++; /* Order of operation: Operand1 Operand2 */ aluParam.DW0.Value = 0x0; aluParam.DW0.BitField.ALUOpcode = static_cast(op); aluParam.DW0.BitField.Operand1 = 0; aluParam.DW0.BitField.Operand2 = 0; *pAluParam = aluParam; pAluParam++; aluParam.DW0.Value = 0x0; aluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); aluParam.DW0.BitField.Operand1 = static_cast(finalResultRegister); aluParam.DW0.BitField.Operand2 = static_cast(postOperationStateRegister); *pAluParam = aluParam; pAluParam++; } /* commandReserve: takes sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE) bytes from the container's command stream, writes the MI_MATH header at the start (DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1) and returns the pointer advanced by one element; the callers in this file write the ALU instructions there. */ template uint32_t *EncodeMath::commandReserve(CommandContainer &container) { size_t size = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * NUM_ALU_INST_FOR_READ_MODIFY_WRITE; auto cmd = reinterpret_cast(container.getCommandStream()->getSpace(size)); MI_MATH mathBuffer; mathBuffer.DW0.Value = 0x0; mathBuffer.DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; mathBuffer.DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; mathBuffer.DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; *reinterpret_cast(cmd) = mathBuffer; cmd++; return cmd; } /* encodeAluAdd: encodeAlu with OPCODE_ADD; the value stored into finalResultRegister is R_ACCU. */ template void EncodeMathMMIO::encodeAluAdd(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters firstOperandRegister, AluRegisters 
secondOperandRegister, AluRegisters finalResultRegister) { encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::OPCODE_ADD, finalResultRegister, AluRegisters::R_ACCU); } /* encodeAluSubStoreCarry: encodeAlu with OPCODE_SUB; the value stored into finalResultRegister is the carry flag R_CF. */ template void EncodeMathMMIO::encodeAluSubStoreCarry(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters regA, AluRegisters regB, AluRegisters finalResultRegister) { /* regB is subtracted from regA */ encodeAlu(pAluParam, regA, regB, AluRegisters::OPCODE_SUB, finalResultRegister, AluRegisters::R_CF); } /* encodeAluAnd: encodeAlu with OPCODE_AND; the value stored into finalResultRegister is R_ACCU. */ template void EncodeMathMMIO::encodeAluAnd(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) { encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::OPCODE_AND, finalResultRegister, AluRegisters::R_ACCU); } /* * greaterThan() tests if firstOperandRegister is greater than * secondOperandRegister. */ template void EncodeMath::greaterThan(CommandContainer &container, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) { uint32_t *cmd = EncodeMath::commandReserve(container); /* firstOperandRegister will be subtracted from secondOperandRegister */ EncodeMathMMIO::encodeAluSubStoreCarry(reinterpret_cast(cmd), secondOperandRegister, firstOperandRegister, finalResultRegister); } /* addition: reserves one MI_MATH command and fills it with an ADD of the two operand registers, result stored into finalResultRegister. */ template void EncodeMath::addition(CommandContainer &container, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) { uint32_t *cmd = EncodeMath::commandReserve(container); EncodeMathMMIO::encodeAluAdd(reinterpret_cast(cmd), firstOperandRegister, secondOperandRegister, finalResultRegister); } /* bitwiseAnd: reserves one MI_MATH command and fills it with an AND of the two operand registers, result stored into finalResultRegister. */ template void EncodeMath::bitwiseAnd(CommandContainer &container, AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) { uint32_t *cmd = EncodeMath::commandReserve(container); EncodeMathMMIO::encodeAluAnd(reinterpret_cast(cmd), firstOperandRegister, 
secondOperandRegister, finalResultRegister); } /* encodeIMM: forwards offset, data and remap to LriHelper::program on the container's command stream. */ template inline void EncodeSetMMIO::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap) { LriHelper::program(container.getCommandStream(), offset, data, remap); } /* encodeMEM: appends an MI_LOAD_REGISTER_MEM with register address = offset and memory address = address, after passing it through remapOffset. */ template void EncodeSetMMIO::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address) { MI_LOAD_REGISTER_MEM cmd = Family::cmdInitLoadRegisterMem; cmd.setRegisterAddress(offset); cmd.setMemoryAddress(address); remapOffset(&cmd); auto buffer = container.getCommandStream()->getSpaceForCmd(); *buffer = cmd; } /* encodeREG: appends an MI_LOAD_REGISTER_REG with source register srcOffset and destination register dstOffset, after passing it through remapOffset. */ template void EncodeSetMMIO::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset) { MI_LOAD_REGISTER_REG cmd = Family::cmdInitLoadRegisterReg; cmd.setSourceRegisterAddress(srcOffset); cmd.setDestinationRegisterAddress(dstOffset); remapOffset(&cmd); auto buffer = container.getCommandStream()->getSpaceForCmd(); *buffer = cmd; } /* encode: appends an MI_STORE_REGISTER_MEM to csr with register address = offset and memory address = address, after passing it through remapOffset. */ template void EncodeStoreMMIO::encode(LinearStream &csr, uint32_t offset, uint64_t address) { MI_STORE_REGISTER_MEM cmd = Family::cmdInitStoreRegisterMem; cmd.setRegisterAddress(offset); cmd.setMemoryAddress(address); remapOffset(&cmd); auto buffer = csr.getSpaceForCmd(); *buffer = cmd; } /* encodeBuffer: fills the surface state at dst for a linear RAW buffer. size - 1 is split through SURFACE_STATE_BUFFER_LENGTH into width/height/depth (each programmed as field + 1); the surface type is NULL when address == 0; coherency follows cpuCoherent. For a render-compressed gmm (unless forceNonAuxMode) coherency is forced to GPU_COHERENT and the CCS aux mode is set. The DisableCachingForStatefulBufferAccess debug flag overrides the MOCS. Finishes by calling encodeExtraBufferParams. */ template void EncodeSurfaceState::encodeBuffer(void *dst, uint64_t address, size_t size, uint32_t mocs, bool cpuCoherent, bool forceNonAuxMode, bool isReadOnly, uint32_t numAvailableDevices, GraphicsAllocation *allocation, GmmHelper *gmmHelper, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) { auto surfaceState = reinterpret_cast(dst); UNRECOVERABLE_IF(!isAligned(size)); SURFACE_STATE_BUFFER_LENGTH Length = {0}; Length.Length = static_cast(size - 1); surfaceState->setWidth(Length.SurfaceState.Width + 1); surfaceState->setHeight(Length.SurfaceState.Height + 1); surfaceState->setDepth(Length.SurfaceState.Depth + 1); surfaceState->setSurfaceType((address != 0) ? 
R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER : R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_NULL); surfaceState->setSurfaceFormat(SURFACE_FORMAT::SURFACE_FORMAT_RAW); surfaceState->setSurfaceVerticalAlignment(R_SURFACE_STATE::SURFACE_VERTICAL_ALIGNMENT_VALIGN_4); surfaceState->setSurfaceHorizontalAlignment(R_SURFACE_STATE::SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_4); surfaceState->setTileMode(R_SURFACE_STATE::TILE_MODE_LINEAR); surfaceState->setVerticalLineStride(0); surfaceState->setVerticalLineStrideOffset(0); surfaceState->setMemoryObjectControlState(mocs); surfaceState->setSurfaceBaseAddress(address); surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE); setCoherencyType(surfaceState, cpuCoherent ? R_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT : R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT); Gmm *gmm = allocation ? allocation->getDefaultGmm() : nullptr; if (gmm && gmm->isRenderCompressed && !forceNonAuxMode) { // It's expected to not program pitch/qpitch/baseAddress for Aux surface in CCS scenarios setCoherencyType(surfaceState, R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT); setBufferAuxParamsForCCS(surfaceState); } if (DebugManager.flags.DisableCachingForStatefulBufferAccess.get()) { surfaceState->setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED)); } EncodeSurfaceState::encodeExtraBufferParams(surfaceState, allocation, gmmHelper, isReadOnly, numAvailableDevices, useGlobalAtomics, areMultipleSubDevicesInContext); } /* getSshAlignedPointer: masks ptr in place with getSurfaceBaseAddressAlignmentMask() and returns in offset the distance between the original and the masked pointer (0 when the mask leaves ptr unchanged). */ template void EncodeSurfaceState::getSshAlignedPointer(uintptr_t &ptr, size_t &offset) { auto sshAlignmentMask = getSurfaceBaseAddressAlignmentMask(); uintptr_t alignedPtr = ptr & sshAlignmentMask; offset = 0; if (ptr != alignedPtr) { offset = ptrDiff(ptr, alignedPtr); ptr = alignedPtr; } } // Returned binding table pointer is relative to given heap (which is assumed to be the Surface state base address) // as required by the INTERFACE_DESCRIPTOR_DATA. 
/* NOTE(review): template parameter lists and cast target types (the text normally written inside angle brackets) look like they were lost from this file as shown - confirm against the original before building. */ /* pushBindingTableAndSurfaceStates: reserves srcKernelSshSize bytes in dstHeap and copies the kernel SSH image there. Returns 0 when bindingTableCount == 0. If the reservation starts at the heap's CPU base the image is copied whole and offsetOfBindingTable is returned. Otherwise only the first offsetOfBindingTable bytes are copied and each of the numberOfBindingTableStates binding table entries is rewritten with its surface state pointer increased by the reservation's offset from the heap base; the return value is then the binding table's offset from the heap base. */ template size_t EncodeSurfaceState::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, size_t bindingTableCount, const void *srcKernelSsh, size_t srcKernelSshSize, size_t numberOfBindingTableStates, size_t offsetOfBindingTable) { using BINDING_TABLE_STATE = typename Family::BINDING_TABLE_STATE; using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA; using RENDER_SURFACE_STATE = typename Family::RENDER_SURFACE_STATE; if (bindingTableCount == 0) { // according to compiler, kernel does not reference BTIs to stateful surfaces, so there's nothing to patch return 0; } size_t sshSize = srcKernelSshSize; DEBUG_BREAK_IF(srcKernelSsh == nullptr); auto srcSurfaceState = srcKernelSsh; // Allocate space for new ssh data auto dstSurfaceState = dstHeap.getSpace(sshSize); // Compiler sends BTI table that is already populated with surface state pointers relative to local SSH. // We may need to patch these pointers so that they are relative to surface state base address if (dstSurfaceState == dstHeap.getCpuBase()) { // nothing to patch, we're at the start of heap (which is assumed to be the surface state base address) // we need to simply copy the ssh (including BTIs from compiler) memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize); return offsetOfBindingTable; } // We can copy-over the surface states, but BTIs will need to be patched memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable); uint32_t surfaceStatesOffset = static_cast(ptrDiff(dstSurfaceState, dstHeap.getCpuBase())); // march over BTIs and offset the pointers based on surface state base address auto *dstBtiTableBase = reinterpret_cast(ptrOffset(dstSurfaceState, offsetOfBindingTable)); DEBUG_BREAK_IF(reinterpret_cast(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0); auto *srcBtiTableBase = reinterpret_cast(ptrOffset(srcSurfaceState, offsetOfBindingTable)); BINDING_TABLE_STATE bti = Family::cmdInitBindingTableState; for (uint32_t i = 
0, e = static_cast(numberOfBindingTableStates); i != e; ++i) { uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer(); uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset; bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits dstBtiTableBase[i] = bti; DEBUG_BREAK_IF(bti.getRawData(0) % sizeof(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE) != 0); /* NOTE(review): the modulus here is sizeof(SURFACESTATEPOINTER_ALIGN_SIZE), not the constant itself as in the binding table pointer check earlier in this function - confirm this is intended. */ } return ptrDiff(dstBtiTableBase, dstHeap.getCpuBase()); } /* encodeExtraCacheSettings: empty body in this file. */ template void EncodeSurfaceState::encodeExtraCacheSettings(R_SURFACE_STATE *surfaceState, const HardwareInfo &hwInfo) {} /* setImageAuxParamsForCCS: sets the auxiliary surface mode to AUX_CCS_E, then calls setFlagsForMediaCompression, setClearColorParams and setUnifiedAuxBaseAddress with the same surface state and gmm. */ template void EncodeSurfaceState::setImageAuxParamsForCCS(R_SURFACE_STATE *surfaceState, Gmm *gmm) { using AUXILIARY_SURFACE_MODE = typename Family::RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; // It's expected to not program pitch/qpitch/baseAddress for Aux surface in CCS scenarios surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E); setFlagsForMediaCompression(surfaceState, gmm); setClearColorParams(surfaceState, gmm); setUnifiedAuxBaseAddress(surfaceState, gmm); } /* setBufferAuxParamsForCCS: sets the auxiliary surface mode to AUX_CCS_E and nothing else. */ template void EncodeSurfaceState::setBufferAuxParamsForCCS(R_SURFACE_STATE *surfaceState) { using AUXILIARY_SURFACE_MODE = typename R_SURFACE_STATE::AUXILIARY_SURFACE_MODE; surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E); } /* isAuxModeEnabled: true when the surface state's auxiliary surface mode is AUX_CCS_E; the gmm argument is not read here. */ template bool EncodeSurfaceState::isAuxModeEnabled(R_SURFACE_STATE *surfaceState, Gmm *gmm) { using AUXILIARY_SURFACE_MODE = typename R_SURFACE_STATE::AUXILIARY_SURFACE_MODE; return (surfaceState->getAuxiliarySurfaceMode() == AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E); } /* getInterfaceDescriptor: returns the next INTERFACE_DESCRIPTOR_DATA slot of the container's current IDD block and writes its index to iddOffset. When nextIddInBlock equals getNumIddPerBlock(), a new block of that many entries is first obtained (from the bindless GLOBAL_DSH heap under bindless configuration, otherwise from the container's DYNAMIC_STATE heap), the index is reset to 0 and EncodeMediaInterfaceDescriptorLoad::encode is called. */ template void *EncodeDispatchKernel::getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset) { if (container.nextIddInBlock == container.getNumIddPerBlock()) { if (ApiSpecificConfig::getBindlessConfiguration()) { 
container.getDevice()->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH)->align(EncodeStates::alignInterfaceDescriptorData); container.setIddBlock(container.getDevice()->getBindlessHeapsHelper()->getSpaceInHeap(sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock(), BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH)); } else { container.getIndirectHeap(HeapType::DYNAMIC_STATE)->align(EncodeStates::alignInterfaceDescriptorData); container.setIddBlock(container.getHeapSpaceAllowGrow(HeapType::DYNAMIC_STATE, sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock())); } container.nextIddInBlock = 0; EncodeMediaInterfaceDescriptorLoad::encode(container); } iddOffset = container.nextIddInBlock; auto interfaceDescriptorData = static_cast(container.getIddBlock()); return &interfaceDescriptorData[container.nextIddInBlock++]; } /* inlineDataProgrammingRequired: returns the kernel's passInlineData attribute, unless the EnablePassInlineData debug flag is set (!= -1) to a value that converts to false, in which case the result is false. */ template bool EncodeDispatchKernel::inlineDataProgrammingRequired(const KernelDescriptor &kernelDesc) { auto checkKernelForInlineData = true; if (DebugManager.flags.EnablePassInlineData.get() != -1) { checkKernelForInlineData = !!DebugManager.flags.EnablePassInlineData.get(); } if (checkKernelForInlineData) { return kernelDesc.kernelAttributes.flags.passInlineData; } return false; } /* adjustTimestampPacket: empty body in this file. */ template void EncodeDispatchKernel::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {} /* setGroupCountIndirect: for each of the 3 dimensions whose offset is defined, stores the GPUGPU_DISPATCHDIM[i] register to crossThreadAddress + offsets[i]; undefined offsets are skipped. */ template void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) { for (int i = 0; i < 3; ++i) { if (NEO::isUndefinedOffset(offsets[i])) { continue; } EncodeStoreMMIO::encode(*container.getCommandStream(), GPUGPU_DISPATCHDIM[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); } } /* adjustBindingTablePrefetch: with prefetch enabled (doBindingTablePrefetch(), overridable by the ForceBtpPrefetchMode debug flag when != -1) programs the sampler count as samplerCount rounded up to groups of 4 and the binding table entry count capped at 31; otherwise programs NO_SAMPLERS_USED and 0 entries. */ template void EncodeDispatchKernel::adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount) { auto enablePrefetch = 
EncodeSurfaceState::doBindingTablePrefetch(); if (DebugManager.flags.ForceBtpPrefetchMode.get() != -1) { enablePrefetch = static_cast(DebugManager.flags.ForceBtpPrefetchMode.get()); } if (enablePrefetch) { interfaceDescriptor.setSamplerCount(static_cast((samplerCount + 3) / 4)); interfaceDescriptor.setBindingTableEntryCount(std::min(bindingTableEntryCount, 31u)); } else { interfaceDescriptor.setSamplerCount(INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT::SAMPLER_COUNT_NO_SAMPLERS_USED); interfaceDescriptor.setBindingTableEntryCount(0u); } } /* adjustInterfaceDescriptorData: empty body in this file. */ template void EncodeDispatchKernel::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {} /* setGlobalWorkSizeIndirect: for each of the 3 dimensions whose offset is defined, emits encodeMulRegVal so that GPUGPU_DISPATCHDIM[i] * lws[i] is stored to crossThreadAddress + offsets[i]; undefined offsets are skipped. */ template void EncodeIndirectParams::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws) { for (int i = 0; i < 3; ++i) { if (NEO::isUndefinedOffset(offsets[i])) { continue; } EncodeMathMMIO::encodeMulRegVal(container, GPUGPU_DISPATCHDIM[i], lws[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); } } /* getCmdsSizeForIndirectParams: size of three MI_LOAD_REGISTER_MEM commands. */ template size_t EncodeIndirectParams::getCmdsSizeForIndirectParams() { return 3 * sizeof(typename Family::MI_LOAD_REGISTER_MEM); } /* getCmdsSizeForSetGroupCountIndirect: size of three MI_STORE_REGISTER_MEM commands, matching the at most three stores setGroupCountIndirect emits. */ template size_t EncodeIndirectParams::getCmdsSizeForSetGroupCountIndirect() { return 3 * (sizeof(MI_STORE_REGISTER_MEM)); } /* getCmdsSizeForSetGroupSizeIndirect: three times the size of one LRR + one LRI + one MI_MATH + one ALU instruction + one SRM. NOTE(review): encodeMulRegVal, as defined in this file, emits a number of MI_MATH and MI_LOAD_REGISTER_REG commands that depends on val - confirm this figure is meant as an exact size. */ template size_t EncodeIndirectParams::getCmdsSizeForSetGroupSizeIndirect() { return 3 * (sizeof(MI_LOAD_REGISTER_REG) + sizeof(MI_LOAD_REGISTER_IMM) + sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) + sizeof(MI_STORE_REGISTER_MEM)); } /* addMiSemaphoreWaitCommand (4 arguments): forwards to the 5-argument overload with registerPollMode = false. */ template void EncodeSempahore::addMiSemaphoreWaitCommand(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData, COMPARE_OPERATION compareMode) { addMiSemaphoreWaitCommand(commandStream, compareAddress, compareData, compareMode, false); } /* addMiSemaphoreWaitCommand (5 arguments): takes space for one command from commandStream and fills it through programMiSemaphoreWait. */ template void EncodeSempahore::addMiSemaphoreWaitCommand(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData, COMPARE_OPERATION 
compareMode, bool registerPollMode) { auto semaphoreCommand = commandStream.getSpaceForCmd(); programMiSemaphoreWait(semaphoreCommand, compareAddress, compareData, compareMode, registerPollMode); } /* getSizeMiSemaphoreWait: sizeof(MI_SEMAPHORE_WAIT). */ template size_t EncodeSempahore::getSizeMiSemaphoreWait() { return sizeof(MI_SEMAPHORE_WAIT); } /* setMiAtomicAddress: splits writeAddress into its low 32 bits (setMemoryAddress) and the bits above 32 (setMemoryAddressHigh). */ template void EncodeAtomic::setMiAtomicAddress(MI_ATOMIC &atomic, uint64_t writeAddress) { atomic.setMemoryAddress(static_cast(writeAddress & 0x0000FFFFFFFFULL)); atomic.setMemoryAddressHigh(static_cast(writeAddress >> 32)); } /* getMiAtomicAddress: recombines the high and low address fields of the command into one 64-bit address. */ template uint64_t EncodeAtomic::getMiAtomicAddress(MI_ATOMIC &atomic) { uint64_t address = (static_cast(atomic.getMemoryAddressHigh()) << 32) | (atomic.getMemoryAddress()); return address; } /* programMiAtomic (pointer overload): builds an MI_ATOMIC from Family::cmdInitAtomic with the given opcode, data size, address, return-data control and CS stall, then writes it to *atomic. Only for ATOMIC_4B_MOVE and ATOMIC_8B_MOVE are the inline-data dword length, the inline data bit and the two operand1 dwords programmed. */ template void EncodeAtomic::programMiAtomic(MI_ATOMIC *atomic, uint64_t writeAddress, ATOMIC_OPCODES opcode, DATA_SIZE dataSize, uint32_t returnDataControl, uint32_t csStall, uint32_t operand1dword0, uint32_t operand1dword1) { MI_ATOMIC cmd = Family::cmdInitAtomic; cmd.setAtomicOpcode(opcode); cmd.setDataSize(dataSize); EncodeAtomic::setMiAtomicAddress(cmd, writeAddress); cmd.setReturnDataControl(returnDataControl); cmd.setCsStall(csStall); if (opcode == ATOMIC_OPCODES::ATOMIC_4B_MOVE || opcode == ATOMIC_OPCODES::ATOMIC_8B_MOVE) { cmd.setDwordLength(MI_ATOMIC::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1); cmd.setInlineData(0x1); cmd.setOperand1DataDword0(operand1dword0); cmd.setOperand1DataDword1(operand1dword1); } *atomic = cmd; } /* programMiAtomic (stream overload): takes space for one command from commandStream and fills it through the pointer overload. */ template void EncodeAtomic::programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, ATOMIC_OPCODES opcode, DATA_SIZE dataSize, uint32_t returnDataControl, uint32_t csStall, uint32_t operand1dword0, uint32_t operand1dword1) { auto miAtomic = commandStream.getSpaceForCmd(); EncodeAtomic::programMiAtomic(miAtomic, writeAddress, opcode, dataSize, returnDataControl, csStall, operand1dword0, operand1dword1); } /* programBatchBufferStart: appends an MI_BATCH_BUFFER_START targeting address in the PPGTT address space; the second-level flag is set only when secondLevel is true. */ template void EncodeBatchBufferStartOrEnd::programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool 
secondLevel) { MI_BATCH_BUFFER_START cmd = Family::cmdInitBatchBufferStart; if (secondLevel) { cmd.setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH); } cmd.setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT); cmd.setBatchBufferStartAddressGraphicsaddress472(address); auto buffer = commandStream->getSpaceForCmd(); *buffer = cmd; } /* programBatchBufferEnd: appends Family::cmdInitBatchBufferEnd to the container's command stream. */ template void EncodeBatchBufferStartOrEnd::programBatchBufferEnd(CommandContainer &container) { MI_BATCH_BUFFER_END cmd = Family::cmdInitBatchBufferEnd; auto buffer = container.getCommandStream()->getSpaceForCmd(); *buffer = cmd; } /* programMiFlushDw: calls programMiFlushDwWA, then appends an MI_FLUSH_DW. With commandWithPostSync the post-sync operation is a timestamp register write (timeStampOperation) or an immediate qword write, and the destination address and immediate data are programmed; appendMiFlushDw runs before the command is written to the stream. */ template void EncodeMiFlushDW::programMiFlushDw(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, bool timeStampOperation, bool commandWithPostSync) { programMiFlushDwWA(commandStream); auto miFlushDwCmd = commandStream.getSpaceForCmd(); MI_FLUSH_DW miFlush = GfxFamily::cmdInitMiFlushDw; if (commandWithPostSync) { auto postSyncType = timeStampOperation ? 
MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_TIMESTAMP_REGISTER : MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD; miFlush.setPostSyncOperation(postSyncType); miFlush.setDestinationAddress(immediateDataGpuAddress); miFlush.setImmediateData(immediateData); } appendMiFlushDw(&miFlush); *miFlushDwCmd = miFlush; } /* getMiFlushDwCmdSizeForDataWrite: sizeof(MI_FLUSH_DW) plus getMiFlushDwWaSize(). */ template size_t EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite() { return sizeof(typename GfxFamily::MI_FLUSH_DW) + EncodeMiFlushDW::getMiFlushDwWaSize(); } /* programMemoryPrefetch: empty body in this file. */ template void EncodeMemoryPrefetch::programMemoryPrefetch(LinearStream &commandStream, const GraphicsAllocation &graphicsAllocation, uint32_t size, size_t offset, const HardwareInfo &hwInfo) {} /* getSizeForMemoryPrefetch: always returns 0 here, consistent with the empty programMemoryPrefetch. */ template size_t EncodeMemoryPrefetch::getSizeForMemoryPrefetch(size_t size) { return 0u; } /* program: appends Family::cmdInitArbCheck to commandStream. */ template void EncodeMiArbCheck::program(LinearStream &commandStream) { auto miArbCheckStream = commandStream.getSpaceForCmd(); *miArbCheckStream = Family::cmdInitArbCheck; } /* getCommandSize: sizeof(MI_ARB_CHECK). */ template size_t EncodeMiArbCheck::getCommandSize() { return sizeof(MI_ARB_CHECK); } } // namespace NEO