1283 lines
66 KiB
C++
1283 lines
66 KiB
C++
/*
|
|
* Copyright (C) 2020-2024 Intel Corporation
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*
|
|
*/
|
|
|
|
#pragma once
|
|
#include "shared/source/command_container/command_encoder.h"
|
|
#include "shared/source/command_container/implicit_scaling.h"
|
|
#include "shared/source/command_stream/linear_stream.h"
|
|
#include "shared/source/debugger/debugger_l0.h"
|
|
#include "shared/source/device/device.h"
|
|
#include "shared/source/execution_environment/execution_environment.h"
|
|
#include "shared/source/execution_environment/root_device_environment.h"
|
|
#include "shared/source/gmm_helper/gmm_helper.h"
|
|
#include "shared/source/helpers/api_specific_config.h"
|
|
#include "shared/source/helpers/bindless_heaps_helper.h"
|
|
#include "shared/source/helpers/blit_commands_helper.h"
|
|
#include "shared/source/helpers/definitions/command_encoder_args.h"
|
|
#include "shared/source/helpers/gfx_core_helper.h"
|
|
#include "shared/source/helpers/hw_info.h"
|
|
#include "shared/source/helpers/local_id_gen.h"
|
|
#include "shared/source/helpers/preamble.h"
|
|
#include "shared/source/helpers/register_offsets.h"
|
|
#include "shared/source/helpers/simd_helper.h"
|
|
#include "shared/source/helpers/string.h"
|
|
#include "shared/source/image/image_surface_state.h"
|
|
#include "shared/source/indirect_heap/indirect_heap.h"
|
|
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
|
|
#include "shared/source/kernel/implicit_args_helper.h"
|
|
#include "shared/source/kernel/kernel_descriptor.h"
|
|
#include "shared/source/os_interface/product_helper.h"
|
|
#include "shared/source/program/kernel_info.h"
|
|
|
|
#include "encode_surface_state.inl"
|
|
#include "encode_surface_state_args.h"
|
|
|
|
#include <algorithm>
|
|
|
|
namespace NEO {
|
|
|
|
template <typename Family>
|
|
uint32_t EncodeStates<Family>::copySamplerState(IndirectHeap *dsh,
|
|
uint32_t samplerStateOffset,
|
|
uint32_t samplerCount,
|
|
uint32_t borderColorOffset,
|
|
const void *fnDynamicStateHeap,
|
|
BindlessHeapsHelper *bindlessHeapHelper,
|
|
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
|
auto sizeSamplerState = sizeof(SAMPLER_STATE) * samplerCount;
|
|
auto borderColorSize = samplerStateOffset - borderColorOffset;
|
|
|
|
SAMPLER_STATE *dstSamplerState = nullptr;
|
|
uint32_t samplerStateOffsetInDsh = 0;
|
|
|
|
dsh->align(NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
|
|
uint32_t borderColorOffsetInDsh = 0;
|
|
if (!bindlessHeapHelper || (!bindlessHeapHelper->isGlobalDshSupported())) {
|
|
borderColorOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
|
|
// add offset of graphics allocation base address relative to heap base address
|
|
if (bindlessHeapHelper) {
|
|
borderColorOffsetInDsh += static_cast<uint32_t>(ptrDiff(dsh->getGpuBase(), bindlessHeapHelper->getGlobalHeapsBase()));
|
|
}
|
|
auto borderColor = dsh->getSpace(borderColorSize);
|
|
|
|
memcpy_s(borderColor, borderColorSize, ptrOffset(fnDynamicStateHeap, borderColorOffset),
|
|
borderColorSize);
|
|
|
|
dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
|
|
samplerStateOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
|
|
|
|
dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(dsh->getSpace(sizeSamplerState));
|
|
} else {
|
|
auto borderColor = reinterpret_cast<const SAMPLER_BORDER_COLOR_STATE *>(ptrOffset(fnDynamicStateHeap, borderColorOffset));
|
|
if (borderColor->getBorderColorRed() != 0.0f ||
|
|
borderColor->getBorderColorGreen() != 0.0f ||
|
|
borderColor->getBorderColorBlue() != 0.0f ||
|
|
(borderColor->getBorderColorAlpha() != 0.0f && borderColor->getBorderColorAlpha() != 1.0f)) {
|
|
UNRECOVERABLE_IF(true);
|
|
} else if (borderColor->getBorderColorAlpha() == 0.0f) {
|
|
borderColorOffsetInDsh = bindlessHeapHelper->getDefaultBorderColorOffset();
|
|
} else {
|
|
borderColorOffsetInDsh = bindlessHeapHelper->getAlphaBorderColorOffset();
|
|
}
|
|
dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
|
|
auto samplerStateInDsh = bindlessHeapHelper->allocateSSInHeap(sizeSamplerState, nullptr, BindlessHeapsHelper::BindlesHeapType::globalDsh);
|
|
dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(samplerStateInDsh.ssPtr);
|
|
samplerStateOffsetInDsh = static_cast<uint32_t>(samplerStateInDsh.surfaceStateOffset);
|
|
}
|
|
|
|
auto &helper = rootDeviceEnvironment.getHelper<ProductHelper>();
|
|
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
|
auto srcSamplerState = reinterpret_cast<const SAMPLER_STATE *>(ptrOffset(fnDynamicStateHeap, samplerStateOffset));
|
|
SAMPLER_STATE state = {};
|
|
for (uint32_t i = 0; i < samplerCount; i++) {
|
|
state = srcSamplerState[i];
|
|
state.setIndirectStatePointer(static_cast<uint32_t>(borderColorOffsetInDsh));
|
|
helper.adjustSamplerState(&state, hwInfo);
|
|
dstSamplerState[i] = state;
|
|
}
|
|
|
|
return samplerStateOffsetInDsh;
|
|
} // namespace NEO
|
|
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs) {
|
|
int logLws = 0;
|
|
int i = val;
|
|
while (val >> logLws) {
|
|
logLws++;
|
|
}
|
|
|
|
EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csGprR0, offset, isBcs);
|
|
EncodeSetMMIO<Family>::encodeIMM(container, RegisterOffsets::csGprR1, 0, true, isBcs);
|
|
|
|
i = 0;
|
|
while (i < logLws) {
|
|
if (val & (1 << i)) {
|
|
EncodeMath<Family>::addition(container, AluRegisters::gpr1,
|
|
AluRegisters::gpr0, AluRegisters::gpr2);
|
|
EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csGprR1, RegisterOffsets::csGprR2, isBcs);
|
|
}
|
|
EncodeMath<Family>::addition(container, AluRegisters::gpr0,
|
|
AluRegisters::gpr0, AluRegisters::gpr2);
|
|
EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csGprR0, RegisterOffsets::csGprR2, isBcs);
|
|
i++;
|
|
}
|
|
EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RegisterOffsets::csGprR1, dstAddress, false, nullptr, isBcs);
|
|
}
|
|
|
|
/*
|
|
* Compute *firstOperand > secondOperand and store the result in
|
|
* MI_PREDICATE_RESULT where firstOperand is an device memory address.
|
|
*
|
|
* To calculate the "greater than" operation in the device,
|
|
* (secondOperand - *firstOperand) is used, and if the carry flag register is
|
|
* set, then (*firstOperand) is greater than secondOperand.
|
|
*/
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeGreaterThanPredicate(CommandContainer &container, uint64_t firstOperand, uint32_t secondOperand, bool isBcs) {
|
|
EncodeSetMMIO<Family>::encodeMEM(container, RegisterOffsets::csGprR0, firstOperand, isBcs);
|
|
EncodeSetMMIO<Family>::encodeIMM(container, RegisterOffsets::csGprR1, secondOperand, true, isBcs);
|
|
|
|
/* RegisterOffsets::csGprR* registers map to AluRegisters::gpr* registers */
|
|
EncodeMath<Family>::greaterThan(container, AluRegisters::gpr0,
|
|
AluRegisters::gpr1, AluRegisters::gpr2);
|
|
|
|
EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csPredicateResult, RegisterOffsets::csGprR2, isBcs);
|
|
}
|
|
|
|
/*
|
|
* Compute bitwise AND between a register value from regOffset and immVal
|
|
* and store it into dstAddress.
|
|
*/
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeBitwiseAndVal(CommandContainer &container, uint32_t regOffset, uint32_t immVal, uint64_t dstAddress,
|
|
bool workloadPartition, void **outCmdBuffer, bool isBcs) {
|
|
EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csGprR13, regOffset, isBcs);
|
|
EncodeSetMMIO<Family>::encodeIMM(container, RegisterOffsets::csGprR14, immVal, true, isBcs);
|
|
EncodeMath<Family>::bitwiseAnd(container, AluRegisters::gpr13,
|
|
AluRegisters::gpr14,
|
|
AluRegisters::gpr12);
|
|
EncodeStoreMMIO<Family>::encode(*container.getCommandStream(),
|
|
RegisterOffsets::csGprR12, dstAddress, workloadPartition, outCmdBuffer, isBcs);
|
|
}
|
|
|
|
/*
|
|
* encodeAlu() performs operations that leave a state including the result of
|
|
* an operation such as the carry flag, and the accu flag with subtraction and
|
|
* addition result.
|
|
*
|
|
* Parameter "postOperationStateRegister" is the ALU register with the result
|
|
* from the operation that the function caller is interested in obtaining.
|
|
*
|
|
* Parameter "finalResultRegister" is the final destination register where
|
|
* data from "postOperationStateRegister" will be copied.
|
|
*/
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeAlu(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters srcA, AluRegisters srcB, AluRegisters op, AluRegisters finalResultRegister, AluRegisters postOperationStateRegister) {
|
|
MI_MATH_ALU_INST_INLINE aluParam;
|
|
|
|
aluParam.DW0.Value = 0x0;
|
|
aluParam.DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::opcodeLoad);
|
|
aluParam.DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::srca);
|
|
aluParam.DW0.BitField.Operand2 = static_cast<uint32_t>(srcA);
|
|
*pAluParam = aluParam;
|
|
pAluParam++;
|
|
|
|
aluParam.DW0.Value = 0x0;
|
|
aluParam.DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::opcodeLoad);
|
|
aluParam.DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::srcb);
|
|
aluParam.DW0.BitField.Operand2 = static_cast<uint32_t>(srcB);
|
|
*pAluParam = aluParam;
|
|
pAluParam++;
|
|
|
|
/* Order of operation: Operand1 <ALUOpcode> Operand2 */
|
|
aluParam.DW0.Value = 0x0;
|
|
aluParam.DW0.BitField.ALUOpcode = static_cast<uint32_t>(op);
|
|
aluParam.DW0.BitField.Operand1 = 0;
|
|
aluParam.DW0.BitField.Operand2 = 0;
|
|
*pAluParam = aluParam;
|
|
pAluParam++;
|
|
|
|
aluParam.DW0.Value = 0x0;
|
|
aluParam.DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::opcodeStore);
|
|
aluParam.DW0.BitField.Operand1 = static_cast<uint32_t>(finalResultRegister);
|
|
aluParam.DW0.BitField.Operand2 = static_cast<uint32_t>(postOperationStateRegister);
|
|
*pAluParam = aluParam;
|
|
pAluParam++;
|
|
}
|
|
|
|
template <typename Family>
|
|
uint32_t *EncodeMath<Family>::commandReserve(CommandContainer &container) {
|
|
return commandReserve(*container.getCommandStream());
|
|
}
|
|
|
|
template <typename Family>
|
|
uint32_t *EncodeMath<Family>::commandReserve(LinearStream &cmdStream) {
|
|
size_t size = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * RegisterConstants::numAluInstForReadModifyWrite;
|
|
|
|
auto cmd = reinterpret_cast<uint32_t *>(cmdStream.getSpace(size));
|
|
MI_MATH mathBuffer;
|
|
mathBuffer.DW0.Value = 0x0;
|
|
mathBuffer.DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
|
|
mathBuffer.DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
|
|
mathBuffer.DW0.BitField.DwordLength = RegisterConstants::numAluInstForReadModifyWrite - 1;
|
|
*reinterpret_cast<MI_MATH *>(cmd) = mathBuffer;
|
|
cmd++;
|
|
|
|
return cmd;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeAluAdd(MI_MATH_ALU_INST_INLINE *pAluParam,
|
|
AluRegisters firstOperandRegister,
|
|
AluRegisters secondOperandRegister,
|
|
AluRegisters finalResultRegister) {
|
|
encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::opcodeAdd, finalResultRegister, AluRegisters::accu);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeAluSubStoreCarry(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters regA, AluRegisters regB, AluRegisters finalResultRegister) {
|
|
/* regB is subtracted from regA */
|
|
encodeAlu(pAluParam, regA, regB, AluRegisters::opcodeSub, finalResultRegister, AluRegisters::cf);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeAluAnd(MI_MATH_ALU_INST_INLINE *pAluParam,
|
|
AluRegisters firstOperandRegister,
|
|
AluRegisters secondOperandRegister,
|
|
AluRegisters finalResultRegister) {
|
|
encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::opcodeAnd, finalResultRegister, AluRegisters::accu);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeIncrementOrDecrement(LinearStream &cmdStream, AluRegisters operandRegister, IncrementOrDecrementOperation operationType, bool isBcs) {
|
|
LriHelper<Family>::program(&cmdStream, RegisterOffsets::csGprR7, 1, true, isBcs);
|
|
LriHelper<Family>::program(&cmdStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);
|
|
|
|
EncodeAluHelper<Family, 4> aluHelper;
|
|
aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srca, operandRegister);
|
|
aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr7);
|
|
aluHelper.setNextAlu((operationType == IncrementOrDecrementOperation::increment) ? AluRegisters::opcodeAdd
|
|
: AluRegisters::opcodeSub);
|
|
aluHelper.setNextAlu(AluRegisters::opcodeStore, operandRegister, AluRegisters::accu);
|
|
|
|
aluHelper.copyToCmdStream(cmdStream);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeIncrement(LinearStream &cmdStream, AluRegisters operandRegister, bool isBcs) {
|
|
encodeIncrementOrDecrement(cmdStream, operandRegister, IncrementOrDecrementOperation::increment, isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMathMMIO<Family>::encodeDecrement(LinearStream &cmdStream, AluRegisters operandRegister, bool isBcs) {
|
|
encodeIncrementOrDecrement(cmdStream, operandRegister, IncrementOrDecrementOperation::decrement, isBcs);
|
|
}
|
|
|
|
/*
|
|
* greaterThan() tests if firstOperandRegister is greater than
|
|
* secondOperandRegister.
|
|
*/
|
|
template <typename Family>
|
|
void EncodeMath<Family>::greaterThan(CommandContainer &container,
|
|
AluRegisters firstOperandRegister,
|
|
AluRegisters secondOperandRegister,
|
|
AluRegisters finalResultRegister) {
|
|
uint32_t *cmd = EncodeMath<Family>::commandReserve(container);
|
|
|
|
/* firstOperandRegister will be subtracted from secondOperandRegister */
|
|
EncodeMathMMIO<Family>::encodeAluSubStoreCarry(reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(cmd),
|
|
secondOperandRegister,
|
|
firstOperandRegister,
|
|
finalResultRegister);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMath<Family>::addition(CommandContainer &container,
|
|
AluRegisters firstOperandRegister,
|
|
AluRegisters secondOperandRegister,
|
|
AluRegisters finalResultRegister) {
|
|
uint32_t *cmd = EncodeMath<Family>::commandReserve(container);
|
|
|
|
EncodeMathMMIO<Family>::encodeAluAdd(reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(cmd),
|
|
firstOperandRegister,
|
|
secondOperandRegister,
|
|
finalResultRegister);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMath<Family>::addition(LinearStream &cmdStream,
|
|
AluRegisters firstOperandRegister,
|
|
AluRegisters secondOperandRegister,
|
|
AluRegisters finalResultRegister) {
|
|
uint32_t *cmd = EncodeMath<Family>::commandReserve(cmdStream);
|
|
|
|
EncodeMathMMIO<Family>::encodeAluAdd(reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(cmd),
|
|
firstOperandRegister,
|
|
secondOperandRegister,
|
|
finalResultRegister);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMath<Family>::bitwiseAnd(CommandContainer &container,
|
|
AluRegisters firstOperandRegister,
|
|
AluRegisters secondOperandRegister,
|
|
AluRegisters finalResultRegister) {
|
|
uint32_t *cmd = EncodeMath<Family>::commandReserve(container);
|
|
|
|
EncodeMathMMIO<Family>::encodeAluAnd(reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(cmd),
|
|
firstOperandRegister,
|
|
secondOperandRegister,
|
|
finalResultRegister);
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeSetMMIO<Family>::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap, bool isBcs) {
|
|
EncodeSetMMIO<Family>::encodeIMM(*container.getCommandStream(), offset, data, remap, isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeSetMMIO<Family>::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address, bool isBcs) {
|
|
EncodeSetMMIO<Family>::encodeMEM(*container.getCommandStream(), offset, address, isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeSetMMIO<Family>::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset, bool isBcs) {
|
|
EncodeSetMMIO<Family>::encodeREG(*container.getCommandStream(), dstOffset, srcOffset, isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeSetMMIO<Family>::encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap, bool isBcs) {
|
|
LriHelper<Family>::program(&cmdStream,
|
|
offset,
|
|
data,
|
|
remap,
|
|
isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeStateBaseAddress<Family>::setSbaTrackingForL0DebuggerIfEnabled(bool trackingEnabled,
|
|
Device &device,
|
|
LinearStream &commandStream,
|
|
STATE_BASE_ADDRESS &sbaCmd, bool useFirstLevelBB) {
|
|
if (!trackingEnabled) {
|
|
return;
|
|
}
|
|
NEO::Debugger::SbaAddresses sbaAddresses = {};
|
|
NEO::EncodeStateBaseAddress<Family>::setSbaAddressesForDebugger(sbaAddresses, sbaCmd);
|
|
device.getL0Debugger()->captureStateBaseAddress(commandStream, sbaAddresses, useFirstLevelBB);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeSetMMIO<Family>::encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address, bool isBcs) {
|
|
MI_LOAD_REGISTER_MEM cmd = Family::cmdInitLoadRegisterMem;
|
|
cmd.setRegisterAddress(offset);
|
|
cmd.setMemoryAddress(address);
|
|
remapOffset(&cmd);
|
|
if (isBcs) {
|
|
cmd.setRegisterAddress(offset + RegisterOffsets::bcs0Base);
|
|
}
|
|
|
|
auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_MEM>();
|
|
*buffer = cmd;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeSetMMIO<Family>::encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset, bool isBcs) {
|
|
MI_LOAD_REGISTER_REG cmd = Family::cmdInitLoadRegisterReg;
|
|
cmd.setSourceRegisterAddress(srcOffset);
|
|
cmd.setDestinationRegisterAddress(dstOffset);
|
|
remapOffset(&cmd);
|
|
if (isBcs) {
|
|
cmd.setSourceRegisterAddress(srcOffset + RegisterOffsets::bcs0Base);
|
|
cmd.setDestinationRegisterAddress(dstOffset + RegisterOffsets::bcs0Base);
|
|
}
|
|
auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_REG>();
|
|
*buffer = cmd;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeStoreMMIO<Family>::encode(LinearStream &csr, uint32_t offset, uint64_t address, bool workloadPartition, void **outCmdBuffer, bool isBcs) {
|
|
auto buffer = csr.getSpaceForCmd<MI_STORE_REGISTER_MEM>();
|
|
if (outCmdBuffer != nullptr) {
|
|
*outCmdBuffer = buffer;
|
|
}
|
|
EncodeStoreMMIO<Family>::encode(buffer, offset, address, workloadPartition, isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeStoreMMIO<Family>::encode(MI_STORE_REGISTER_MEM *cmdBuffer, uint32_t offset, uint64_t address, bool workloadPartition, bool isBcs) {
|
|
MI_STORE_REGISTER_MEM cmd = Family::cmdInitStoreRegisterMem;
|
|
cmd.setRegisterAddress(offset);
|
|
cmd.setMemoryAddress(address);
|
|
appendFlags(&cmd, workloadPartition);
|
|
if (isBcs) {
|
|
cmd.setRegisterAddress(offset + RegisterOffsets::bcs0Base);
|
|
}
|
|
*cmdBuffer = cmd;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeSurfaceState<Family>::encodeBuffer(EncodeSurfaceStateArgs &args) {
|
|
auto surfaceState = reinterpret_cast<R_SURFACE_STATE *>(args.outMemory);
|
|
uint64_t bufferSize = alignUp(args.size, getSurfaceBaseAddressAlignment());
|
|
bufferSize = std::min(bufferSize, static_cast<uint64_t>(MemoryConstants::fullStatefulRegion - 1));
|
|
SurfaceStateBufferLength length = {0};
|
|
length.length = static_cast<uint32_t>(bufferSize - 1);
|
|
|
|
surfaceState->setWidth(length.surfaceState.width + 1);
|
|
surfaceState->setHeight(length.surfaceState.height + 1);
|
|
surfaceState->setDepth(length.surfaceState.depth + 1);
|
|
|
|
surfaceState->setSurfaceType((args.graphicsAddress != 0) ? R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER
|
|
: R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_NULL);
|
|
surfaceState->setSurfaceFormat(SURFACE_FORMAT::SURFACE_FORMAT_RAW);
|
|
surfaceState->setSurfaceVerticalAlignment(R_SURFACE_STATE::SURFACE_VERTICAL_ALIGNMENT_VALIGN_4);
|
|
surfaceState->setSurfaceHorizontalAlignment(R_SURFACE_STATE::SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_DEFAULT);
|
|
|
|
surfaceState->setTileMode(R_SURFACE_STATE::TILE_MODE_LINEAR);
|
|
surfaceState->setVerticalLineStride(0);
|
|
surfaceState->setVerticalLineStrideOffset(0);
|
|
surfaceState->setMemoryObjectControlState(args.mocs);
|
|
surfaceState->setSurfaceBaseAddress(args.graphicsAddress);
|
|
|
|
surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE);
|
|
|
|
setCoherencyType(surfaceState, args.cpuCoherent ? R_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT : R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT);
|
|
|
|
auto compressionEnabled = args.allocation ? args.allocation->isCompressionEnabled() : false;
|
|
if (compressionEnabled && !args.forceNonAuxMode) {
|
|
// Its expected to not program pitch/qpitch/baseAddress for Aux surface in CCS scenarios
|
|
setCoherencyType(surfaceState, R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT);
|
|
setBufferAuxParamsForCCS(surfaceState);
|
|
}
|
|
|
|
if (debugManager.flags.DisableCachingForStatefulBufferAccess.get()) {
|
|
surfaceState->setMemoryObjectControlState(args.gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED));
|
|
}
|
|
|
|
EncodeSurfaceState<Family>::encodeExtraBufferParams(args);
|
|
|
|
EncodeSurfaceState<Family>::appendBufferSurfaceState(args);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeSurfaceState<Family>::getSshAlignedPointer(uintptr_t &ptr, size_t &offset) {
|
|
auto sshAlignmentMask =
|
|
getSurfaceBaseAddressAlignmentMask();
|
|
uintptr_t alignedPtr = ptr & sshAlignmentMask;
|
|
|
|
offset = 0;
|
|
if (ptr != alignedPtr) {
|
|
offset = ptrDiff(ptr, alignedPtr);
|
|
ptr = alignedPtr;
|
|
}
|
|
}
|
|
|
|
// Returned binding table pointer is relative to given heap (which is assumed to be the Surface state base addess)
|
|
// as required by the INTERFACE_DESCRIPTOR_DATA.
|
|
template <typename Family>
|
|
size_t EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap,
|
|
const void *srcKernelSsh, size_t srcKernelSshSize,
|
|
size_t numberOfBindingTableStates, size_t offsetOfBindingTable) {
|
|
using BINDING_TABLE_STATE = typename Family::BINDING_TABLE_STATE;
|
|
using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
|
|
using RENDER_SURFACE_STATE = typename Family::RENDER_SURFACE_STATE;
|
|
|
|
size_t sshSize = srcKernelSshSize;
|
|
DEBUG_BREAK_IF(srcKernelSsh == nullptr);
|
|
|
|
auto srcSurfaceState = srcKernelSsh;
|
|
// Allocate space for new ssh data
|
|
auto dstSurfaceState = dstHeap.getSpace(sshSize);
|
|
|
|
// Compiler sends BTI table that is already populated with surface state pointers relative to local SSH.
|
|
// We may need to patch these pointers so that they are relative to surface state base address
|
|
if (dstSurfaceState == dstHeap.getCpuBase()) {
|
|
// nothing to patch, we're at the start of heap (which is assumed to be the surface state base address)
|
|
// we need to simply copy the ssh (including BTIs from compiler)
|
|
memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize);
|
|
return offsetOfBindingTable;
|
|
}
|
|
|
|
// We can copy-over the surface states, but BTIs will need to be patched
|
|
memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable);
|
|
|
|
uint32_t surfaceStatesOffset = static_cast<uint32_t>(ptrDiff(dstSurfaceState, dstHeap.getCpuBase()));
|
|
|
|
// march over BTIs and offset the pointers based on surface state base address
|
|
auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(dstSurfaceState, offsetOfBindingTable));
|
|
DEBUG_BREAK_IF(reinterpret_cast<uintptr_t>(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0);
|
|
auto *srcBtiTableBase = reinterpret_cast<const BINDING_TABLE_STATE *>(ptrOffset(srcSurfaceState, offsetOfBindingTable));
|
|
BINDING_TABLE_STATE bti = Family::cmdInitBindingTableState;
|
|
for (uint32_t i = 0, e = static_cast<uint32_t>(numberOfBindingTableStates); i != e; ++i) {
|
|
uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer();
|
|
uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset;
|
|
bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits
|
|
dstBtiTableBase[i] = bti;
|
|
DEBUG_BREAK_IF(bti.getRawData(0) % sizeof(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE) != 0);
|
|
}
|
|
|
|
return ptrDiff(dstBtiTableBase, dstHeap.getCpuBase());
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeSurfaceState<Family>::encodeExtraCacheSettings(R_SURFACE_STATE *surfaceState, const EncodeSurfaceStateArgs &args) {}
|
|
|
|
template <typename Family>
|
|
void EncodeSurfaceState<Family>::setImageAuxParamsForCCS(R_SURFACE_STATE *surfaceState, Gmm *gmm) {
|
|
using AUXILIARY_SURFACE_MODE = typename Family::RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE;
|
|
// Its expected to not program pitch/qpitch/baseAddress for Aux surface in CCS scenarios
|
|
surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E);
|
|
setFlagsForMediaCompression(surfaceState, gmm);
|
|
|
|
setClearColorParams(surfaceState, gmm);
|
|
setUnifiedAuxBaseAddress<Family>(surfaceState, gmm);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeSurfaceState<Family>::setBufferAuxParamsForCCS(R_SURFACE_STATE *surfaceState) {
|
|
using AUXILIARY_SURFACE_MODE = typename R_SURFACE_STATE::AUXILIARY_SURFACE_MODE;
|
|
|
|
surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E);
|
|
}
|
|
|
|
template <typename Family>
|
|
bool EncodeSurfaceState<Family>::isAuxModeEnabled(R_SURFACE_STATE *surfaceState, Gmm *gmm) {
|
|
using AUXILIARY_SURFACE_MODE = typename R_SURFACE_STATE::AUXILIARY_SURFACE_MODE;
|
|
|
|
return (surfaceState->getAuxiliarySurfaceMode() == AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeSurfaceState<Family>::appendParamsForImageFromBuffer(R_SURFACE_STATE *surfaceState) {
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeDispatchKernel<Family>::encodeCommon(CommandContainer &container, EncodeDispatchKernelArgs &args) {
|
|
using DefaultWalkerType = typename Family::DefaultWalkerType;
|
|
EncodeDispatchKernel<Family>::template encode<DefaultWalkerType>(container, args);
|
|
}
|
|
|
|
template <typename Family>
|
|
void *EncodeDispatchKernel<Family>::getInterfaceDescriptor(CommandContainer &container, IndirectHeap *childDsh, uint32_t &iddOffset) {
|
|
|
|
if (container.nextIddInBlockRef() == container.getNumIddPerBlock()) {
|
|
|
|
void *heapPointer = nullptr;
|
|
size_t heapSize = sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock();
|
|
if (childDsh != nullptr) {
|
|
childDsh->align(NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
|
|
heapPointer = childDsh->getSpace(heapSize);
|
|
} else {
|
|
container.getIndirectHeap(HeapType::dynamicState)->align(NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
|
|
heapPointer = container.getHeapSpaceAllowGrow(HeapType::dynamicState, heapSize);
|
|
}
|
|
container.setIddBlock(heapPointer);
|
|
container.nextIddInBlockRef() = 0;
|
|
}
|
|
|
|
iddOffset = container.nextIddInBlockRef();
|
|
auto interfaceDescriptorData = static_cast<INTERFACE_DESCRIPTOR_DATA *>(container.getIddBlock());
|
|
container.nextIddInBlockRef()++;
|
|
return &interfaceDescriptorData[iddOffset];
|
|
}
|
|
|
|
template <typename Family>
|
|
bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDescriptor &kernelDesc) {
|
|
auto checkKernelForInlineData = true;
|
|
if (debugManager.flags.EnablePassInlineData.get() != -1) {
|
|
checkKernelForInlineData = !!debugManager.flags.EnablePassInlineData.get();
|
|
}
|
|
if (checkKernelForInlineData) {
|
|
return kernelDesc.kernelAttributes.flags.passInlineData;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
template <typename Family>
|
|
template <typename InterfaceDescriptorType>
|
|
void EncodeDispatchKernel<Family>::encodeEuSchedulingPolicy(InterfaceDescriptorType *pInterfaceDescriptor, const KernelDescriptor &kernelDesc, int32_t defaultPipelinedThreadArbitrationPolicy) {
|
|
}
|
|
|
|
template <typename Family>
|
|
template <typename WalkerType>
|
|
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}
|
|
|
|
template <typename Family>
|
|
template <typename WalkerType>
|
|
void EncodeDispatchKernel<Family>::setWalkerRegionSettings(WalkerType &walkerCmd, const HardwareInfo &hwInfo, uint32_t partitionCount, uint32_t workgroupSize, uint32_t maxWgCountPerTile, bool requiredWalkOrder) {}
|
|
|
|
template <typename Family>
|
|
void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr) {
|
|
const auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
|
|
setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa);
|
|
setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
|
|
UNRECOVERABLE_IF(NEO::isValidOffset(kernelDescriptor.payloadMappings.dispatchTraits.workDim) && (kernelDescriptor.payloadMappings.dispatchTraits.workDim & 0b11) != 0u);
|
|
setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
|
|
if (implicitArgsGpuPtr) {
|
|
CrossThreadDataOffset groupCountOffset[] = {offsetof(ImplicitArgs, groupCountX), offsetof(ImplicitArgs, groupCountY), offsetof(ImplicitArgs, groupCountZ)};
|
|
CrossThreadDataOffset globalSizeOffset[] = {offsetof(ImplicitArgs, globalSizeX), offsetof(ImplicitArgs, globalSizeY), offsetof(ImplicitArgs, globalSizeZ)};
|
|
setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr);
|
|
setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
|
|
setWorkDimIndirect(container, offsetof(ImplicitArgs, numWorkDim), implicitArgsGpuPtr, dispatchInterface->getGroupSize());
|
|
}
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress) {
|
|
for (int i = 0; i < 3; ++i) {
|
|
if (NEO::isUndefinedOffset(offsets[i])) {
|
|
continue;
|
|
}
|
|
EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RegisterOffsets::gpgpuDispatchDim[i], ptrOffset(crossThreadAddress, offsets[i]), false, nullptr, false);
|
|
}
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeIndirectParams<Family>::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, uint64_t crossThreadAddress, const uint32_t *groupSize) {
|
|
if (NEO::isValidOffset(workDimOffset)) {
|
|
auto dstPtr = ptrOffset(crossThreadAddress, workDimOffset);
|
|
constexpr uint32_t resultRegister = RegisterOffsets::csGprR0;
|
|
constexpr AluRegisters resultAluRegister = AluRegisters::gpr0;
|
|
const uint32_t offset = static_cast<uint32_t>((1ull << 8 * (dstPtr & 0b11)) - 1);
|
|
const uint32_t memoryMask = std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>((1ull << 8 * ((dstPtr & 0b11) + 1)) - 1) + offset;
|
|
|
|
/*
|
|
* if ( groupSize[2] > 1 || groupCount[2] > 1 ) { workdim = 3 }
|
|
* else if ( groupSize[1] + groupCount[1] > 2 ) { workdim = 2 }
|
|
* else { workdim = 1 }
|
|
*/
|
|
|
|
if (groupSize[2] > 1) {
|
|
EncodeSetMMIO<Family>::encodeIMM(container, resultRegister, 3 << (8 * (dstPtr & 0b11)), true, false);
|
|
} else {
|
|
|
|
constexpr uint32_t groupCount2Register = RegisterOffsets::csGprR1;
|
|
constexpr AluRegisters groupCount2AluRegister = AluRegisters::gpr1;
|
|
|
|
constexpr uint32_t groupSize1Register = RegisterOffsets::csGprR0;
|
|
constexpr AluRegisters groupSize1AluRegister = AluRegisters::gpr0;
|
|
|
|
constexpr uint32_t groupCount1Register = RegisterOffsets::csGprR1;
|
|
constexpr AluRegisters groupCount1AluRegister = AluRegisters::gpr1;
|
|
|
|
constexpr AluRegisters sumAluRegister = AluRegisters::gpr0;
|
|
|
|
constexpr AluRegisters workDimEq3AluRegister = AluRegisters::gpr3;
|
|
|
|
constexpr AluRegisters workDimGe2AluRegister = AluRegisters::gpr4;
|
|
|
|
constexpr uint32_t constantOneRegister = RegisterOffsets::csGprR5;
|
|
constexpr AluRegisters constantOneAluRegister = AluRegisters::gpr5;
|
|
constexpr uint32_t constantTwoRegister = RegisterOffsets::csGprR6;
|
|
constexpr AluRegisters constantTwoAluRegister = AluRegisters::gpr6;
|
|
|
|
constexpr uint32_t backupRegister = RegisterOffsets::csGprR7;
|
|
constexpr AluRegisters backupAluRegister = AluRegisters::gpr7;
|
|
|
|
constexpr uint32_t memoryMaskRegister = RegisterOffsets::csGprR8;
|
|
constexpr AluRegisters memoryMaskAluRegister = AluRegisters::gpr8;
|
|
|
|
constexpr uint32_t offsetRegister = RegisterOffsets::csGprR8;
|
|
constexpr AluRegisters offsetAluRegister = AluRegisters::gpr8;
|
|
|
|
if (offset) {
|
|
EncodeSetMMIO<Family>::encodeMEM(container, backupRegister, dstPtr, false);
|
|
EncodeSetMMIO<Family>::encodeIMM(container, memoryMaskRegister, memoryMask, true, false);
|
|
EncodeMath<Family>::bitwiseAnd(container, memoryMaskAluRegister, backupAluRegister, backupAluRegister);
|
|
EncodeSetMMIO<Family>::encodeIMM(container, offsetRegister, offset, true, false);
|
|
}
|
|
|
|
EncodeSetMMIO<Family>::encodeIMM(container, constantOneRegister, 1, true, false);
|
|
EncodeSetMMIO<Family>::encodeIMM(container, constantTwoRegister, 2, true, false);
|
|
|
|
EncodeSetMMIO<Family>::encodeREG(container, groupCount2Register, RegisterOffsets::gpgpuDispatchDim[2], false);
|
|
|
|
EncodeMath<Family>::greaterThan(container, groupCount2AluRegister, constantOneAluRegister, workDimEq3AluRegister);
|
|
EncodeMath<Family>::bitwiseAnd(container, workDimEq3AluRegister, constantOneAluRegister, workDimEq3AluRegister);
|
|
|
|
EncodeSetMMIO<Family>::encodeIMM(container, groupSize1Register, groupSize[1], true, false);
|
|
EncodeSetMMIO<Family>::encodeREG(container, groupCount1Register, RegisterOffsets::gpgpuDispatchDim[1], false);
|
|
|
|
EncodeMath<Family>::addition(container, groupSize1AluRegister, groupCount1AluRegister, sumAluRegister);
|
|
EncodeMath<Family>::addition(container, sumAluRegister, workDimEq3AluRegister, sumAluRegister);
|
|
EncodeMath<Family>::greaterThan(container, sumAluRegister, constantTwoAluRegister, workDimGe2AluRegister);
|
|
EncodeMath<Family>::bitwiseAnd(container, workDimGe2AluRegister, constantOneAluRegister, workDimGe2AluRegister);
|
|
|
|
if (offset) {
|
|
EncodeMath<Family>::addition(container, constantOneAluRegister, offsetAluRegister, constantOneAluRegister);
|
|
EncodeMath<Family>::addition(container, workDimEq3AluRegister, offsetAluRegister, workDimEq3AluRegister);
|
|
EncodeMath<Family>::bitwiseAnd(container, workDimEq3AluRegister, constantOneAluRegister, workDimEq3AluRegister);
|
|
EncodeMath<Family>::addition(container, workDimGe2AluRegister, offsetAluRegister, workDimGe2AluRegister);
|
|
EncodeMath<Family>::bitwiseAnd(container, workDimGe2AluRegister, constantOneAluRegister, workDimGe2AluRegister);
|
|
}
|
|
|
|
EncodeSetMMIO<Family>::encodeREG(container, resultRegister, constantOneRegister, false);
|
|
EncodeMath<Family>::addition(container, resultAluRegister, workDimGe2AluRegister, resultAluRegister);
|
|
EncodeMath<Family>::addition(container, resultAluRegister, workDimEq3AluRegister, resultAluRegister);
|
|
|
|
if (offset) {
|
|
EncodeMath<Family>::addition(container, resultAluRegister, backupAluRegister, resultAluRegister);
|
|
}
|
|
}
|
|
EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), resultRegister, dstPtr, false, nullptr, false);
|
|
}
|
|
}
|
|
|
|
template <typename Family>
|
|
bool EncodeSurfaceState<Family>::doBindingTablePrefetch() {
|
|
auto enableBindingTablePrefetech = isBindingTablePrefetchPreferred();
|
|
if (debugManager.flags.ForceBtpPrefetchMode.get() != -1) {
|
|
enableBindingTablePrefetech = static_cast<bool>(debugManager.flags.ForceBtpPrefetchMode.get());
|
|
}
|
|
return enableBindingTablePrefetech;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeSurfaceState<Family>::setPitchForScratch(R_SURFACE_STATE *surfaceState, uint32_t pitch, const ProductHelper &productHelper) {
|
|
surfaceState->setSurfacePitch(pitch);
|
|
}
|
|
|
|
template <typename Family>
|
|
uint32_t EncodeSurfaceState<Family>::getPitchForScratchInBytes(R_SURFACE_STATE *surfaceState, const ProductHelper &productHelper) {
|
|
return surfaceState->getSurfacePitch();
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount) {
|
|
auto enablePrefetch = EncodeSurfaceState<Family>::doBindingTablePrefetch();
|
|
|
|
if (enablePrefetch) {
|
|
interfaceDescriptor.setSamplerCount(static_cast<typename INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT>((samplerCount + 3) / 4));
|
|
interfaceDescriptor.setBindingTableEntryCount(std::min(bindingTableEntryCount, 31u));
|
|
} else {
|
|
interfaceDescriptor.setSamplerCount(INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT::SAMPLER_COUNT_NO_SAMPLERS_USED);
|
|
interfaceDescriptor.setBindingTableEntryCount(0u);
|
|
}
|
|
}
|
|
|
|
template <typename Family>
|
|
template <typename WalkerType, typename InterfaceDescriptorType>
|
|
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {}
|
|
|
|
template <typename Family>
|
|
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
|
|
using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
|
|
constexpr auto samplerStateSize = sizeof(typename Family::SAMPLER_STATE);
|
|
const auto numSamplers = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
|
|
const auto additionalDshSize = additionalSizeRequiredDsh(iddCount);
|
|
if (numSamplers == 0U) {
|
|
return alignUp(additionalDshSize, EncodeDispatchKernel<Family>::getDefaultDshAlignment());
|
|
}
|
|
|
|
size_t size = kernelDescriptor.payloadMappings.samplerTable.tableOffset -
|
|
kernelDescriptor.payloadMappings.samplerTable.borderColor;
|
|
size = alignUp(size, EncodeDispatchKernel<Family>::getDefaultDshAlignment());
|
|
|
|
size += numSamplers * samplerStateSize;
|
|
size = alignUp(size, INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
|
|
|
|
if (additionalDshSize > 0) {
|
|
size = alignUp(size, EncodeStates<Family>::alignInterfaceDescriptorData);
|
|
size += additionalDshSize;
|
|
size = alignUp(size, EncodeDispatchKernel<Family>::getDefaultDshAlignment());
|
|
}
|
|
|
|
return size;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
template <typename WalkerType, typename InterfaceDescriptorType>
|
|
void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
|
|
const auto &productHelper = device.getProductHelper();
|
|
|
|
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
|
|
|
bool adjustTGDispatchSize = true;
|
|
if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
|
|
adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
|
|
}
|
|
// apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
|
|
auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
|
|
if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
|
|
algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
|
|
}
|
|
|
|
if (algorithmVersion == 2) {
|
|
auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
|
|
if (grfCount == 256) {
|
|
threadsPerXeCore /= 2;
|
|
}
|
|
auto tgDispatchSizeSelected = 8;
|
|
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
|
|
|
|
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
|
|
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
|
|
tgDispatchSizeSelected /= 2;
|
|
}
|
|
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
|
|
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
|
|
tgDispatchSizeSelected /= 2;
|
|
}
|
|
}
|
|
|
|
auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
|
|
auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
|
|
|
|
// make sure we fit all xe core
|
|
while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
|
|
tgDispatchSizeSelected /= 2;
|
|
}
|
|
|
|
auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
|
|
// make sure we do not use more threads then present on each xe core
|
|
while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
|
|
tgDispatchSizeSelected /= 2;
|
|
threadCountPerGrouping /= 2;
|
|
}
|
|
|
|
if (tgDispatchSizeSelected == 8) {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
|
|
} else if (tgDispatchSizeSelected == 1) {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
|
} else if (tgDispatchSizeSelected == 2) {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
|
|
} else {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
|
|
}
|
|
} else {
|
|
if (adjustTGDispatchSize) {
|
|
UNRECOVERABLE_IF(grfCount == 0u);
|
|
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
|
|
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
|
|
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
|
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount);
|
|
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
|
|
const uint32_t tilesCount = device.getNumSubDevices();
|
|
availableThreadCount *= tilesCount;
|
|
}
|
|
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
|
|
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
|
|
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
|
|
auto tgDispatchSizeSelected = 1u;
|
|
|
|
if (dispatchedTotalThreadCount <= availableThreadCount) {
|
|
tgDispatchSizeSelected = 1;
|
|
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
|
|
tgDispatchSizeSelected = 8;
|
|
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
|
|
tgDispatchSizeSelected = 4;
|
|
} else {
|
|
tgDispatchSizeSelected = 2;
|
|
}
|
|
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
|
|
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
|
|
tgDispatchSizeSelected /= 2;
|
|
}
|
|
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
|
|
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
|
|
tgDispatchSizeSelected /= 2;
|
|
}
|
|
}
|
|
if (tgDispatchSizeSelected == 8) {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
|
|
} else if (tgDispatchSizeSelected == 1) {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
|
} else if (tgDispatchSizeSelected == 2) {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
|
|
} else {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
|
|
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE>(
|
|
debugManager.flags.ForceThreadGroupDispatchSize.get()));
|
|
}
|
|
}
|
|
|
|
template <typename Family>
|
|
size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
|
|
size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize;
|
|
bool isBindlessKernel = NEO::KernelDescriptor ::isBindlessAddressingKernel(kernelInfo.kernelDescriptor);
|
|
if (isBindlessKernel) {
|
|
requiredSshSize = std::max(requiredSshSize, kernelInfo.kernelDescriptor.kernelAttributes.numArgsStateful * sizeof(typename Family::RENDER_SURFACE_STATE));
|
|
}
|
|
requiredSshSize = alignUp(requiredSshSize, EncodeDispatchKernel<Family>::getDefaultSshAlignment());
|
|
return requiredSshSize;
|
|
}
|
|
|
|
template <typename Family>
|
|
size_t EncodeDispatchKernel<Family>::getDefaultDshAlignment() {
|
|
return Family::cacheLineSize;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
size_t EncodeDispatchKernel<GfxFamily>::getScratchPtrOffsetOfImplicitArgs() {
|
|
return 0;
|
|
}
|
|
|
|
template <typename Family>
|
|
template <bool isHeapless>
|
|
void EncodeDispatchKernel<Family>::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &submissionCsr) {
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) {
|
|
for (int i = 0; i < 3; ++i) {
|
|
if (NEO::isUndefinedOffset(offsets[i])) {
|
|
continue;
|
|
}
|
|
EncodeMathMMIO<Family>::encodeMulRegVal(container, RegisterOffsets::gpgpuDispatchDim[i], lws[i], ptrOffset(crossThreadAddress, offsets[i]), false);
|
|
}
|
|
}
|
|
|
|
template <typename Family>
|
|
inline size_t EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(const uint32_t *groupSize, bool misaligedPtr) {
|
|
constexpr uint32_t aluCmdSize = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * RegisterConstants::numAluInstForReadModifyWrite;
|
|
auto requiredSize = sizeof(MI_STORE_REGISTER_MEM) + sizeof(MI_LOAD_REGISTER_IMM);
|
|
UNRECOVERABLE_IF(!groupSize);
|
|
if (groupSize[2] < 2) {
|
|
requiredSize += 2 * sizeof(MI_LOAD_REGISTER_IMM) + 3 * sizeof(MI_LOAD_REGISTER_REG) + 8 * aluCmdSize;
|
|
if (misaligedPtr) {
|
|
requiredSize += 2 * sizeof(MI_LOAD_REGISTER_IMM) + sizeof(MI_LOAD_REGISTER_MEM) + 7 * aluCmdSize;
|
|
}
|
|
}
|
|
return requiredSize;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeSemaphore<Family>::appendSemaphoreCommand(MI_SEMAPHORE_WAIT &cmd, uint64_t compareData, bool indirect, bool useQwordData, bool switchOnUnsuccessful) {
|
|
constexpr uint64_t upper32b = static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) << 32;
|
|
UNRECOVERABLE_IF(useQwordData || (compareData & upper32b));
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeSemaphore<Family>::addMiSemaphoreWaitCommand(LinearStream &commandStream,
|
|
uint64_t compareAddress,
|
|
uint64_t compareData,
|
|
COMPARE_OPERATION compareMode,
|
|
bool registerPollMode,
|
|
bool useQwordData,
|
|
bool indirect,
|
|
bool switchOnUnsuccessful,
|
|
void **outSemWaitCmd) {
|
|
auto semaphoreCommand = commandStream.getSpaceForCmd<MI_SEMAPHORE_WAIT>();
|
|
if (outSemWaitCmd != nullptr) {
|
|
*outSemWaitCmd = semaphoreCommand;
|
|
}
|
|
programMiSemaphoreWait(semaphoreCommand, compareAddress, compareData, compareMode, registerPollMode, true, useQwordData, indirect, switchOnUnsuccessful);
|
|
}
|
|
template <typename Family>
|
|
void EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(LinearStream &commandStream, std::list<void *> &commandsList) {
|
|
MI_SEMAPHORE_WAIT *semaphoreCommand = commandStream.getSpaceForCmd<MI_SEMAPHORE_WAIT>();
|
|
commandsList.push_back(semaphoreCommand);
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeAtomic<Family>::setMiAtomicAddress(MI_ATOMIC &atomic, uint64_t writeAddress) {
|
|
atomic.setMemoryAddress(static_cast<uint32_t>(writeAddress & 0x0000FFFFFFFFULL));
|
|
atomic.setMemoryAddressHigh(static_cast<uint32_t>(writeAddress >> 32));
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeAtomic<Family>::programMiAtomic(MI_ATOMIC *atomic,
|
|
uint64_t writeAddress,
|
|
ATOMIC_OPCODES opcode,
|
|
DATA_SIZE dataSize,
|
|
uint32_t returnDataControl,
|
|
uint32_t csStall,
|
|
uint64_t operand1Data,
|
|
uint64_t operand2Data) {
|
|
MI_ATOMIC cmd = Family::cmdInitAtomic;
|
|
cmd.setAtomicOpcode(opcode);
|
|
cmd.setDataSize(dataSize);
|
|
EncodeAtomic<Family>::setMiAtomicAddress(cmd, writeAddress);
|
|
cmd.setReturnDataControl(returnDataControl);
|
|
cmd.setCsStall(csStall);
|
|
|
|
if (opcode == ATOMIC_OPCODES::ATOMIC_4B_MOVE || opcode == ATOMIC_OPCODES::ATOMIC_8B_MOVE || opcode == ATOMIC_OPCODES::ATOMIC_8B_CMP_WR || opcode == ATOMIC_OPCODES::ATOMIC_8B_ADD) {
|
|
cmd.setDwordLength(MI_ATOMIC::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1);
|
|
cmd.setInlineData(0x1);
|
|
|
|
cmd.setOperand1DataDword0(getLowPart(operand1Data));
|
|
cmd.setOperand1DataDword1(getHighPart(operand1Data));
|
|
|
|
cmd.setOperand2DataDword0(getLowPart(operand2Data));
|
|
cmd.setOperand2DataDword1(getHighPart(operand2Data));
|
|
}
|
|
|
|
*atomic = cmd;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeAtomic<Family>::programMiAtomic(LinearStream &commandStream,
|
|
uint64_t writeAddress,
|
|
ATOMIC_OPCODES opcode,
|
|
DATA_SIZE dataSize,
|
|
uint32_t returnDataControl,
|
|
uint32_t csStall,
|
|
uint64_t operand1Data,
|
|
uint64_t operand2Data) {
|
|
auto miAtomic = commandStream.getSpaceForCmd<MI_ATOMIC>();
|
|
EncodeAtomic<Family>::programMiAtomic(miAtomic, writeAddress, opcode, dataSize, returnDataControl, csStall, operand1Data, operand2Data);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeBatchBufferStartOrEnd<Family>::programConditionalDataMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress,
|
|
uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs) {
|
|
EncodeSetMMIO<Family>::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress, isBcs);
|
|
|
|
if (useQwordData) {
|
|
EncodeSetMMIO<Family>::encodeMEM(commandStream, RegisterOffsets::csGprR7 + 4, compareAddress + 4, isBcs);
|
|
} else {
|
|
LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);
|
|
}
|
|
|
|
uint32_t compareDataLow = static_cast<uint32_t>(compareData & std::numeric_limits<uint32_t>::max());
|
|
uint32_t compareDataHigh = useQwordData ? static_cast<uint32_t>(compareData >> 32) : 0;
|
|
|
|
LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true, isBcs);
|
|
LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true, isBcs);
|
|
|
|
programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect, isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeBatchBufferStartOrEnd<Family>::programConditionalDataRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint32_t compareReg,
|
|
uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs) {
|
|
EncodeSetMMIO<Family>::encodeREG(commandStream, RegisterOffsets::csGprR7, compareReg, isBcs);
|
|
if (useQwordData) {
|
|
EncodeSetMMIO<Family>::encodeREG(commandStream, RegisterOffsets::csGprR7 + 4, compareReg + 4, isBcs);
|
|
} else {
|
|
LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);
|
|
}
|
|
|
|
uint32_t compareDataLow = static_cast<uint32_t>(compareData & std::numeric_limits<uint32_t>::max());
|
|
uint32_t compareDataHigh = useQwordData ? static_cast<uint32_t>(compareData >> 32) : 0;
|
|
|
|
LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true, isBcs);
|
|
LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true, isBcs);
|
|
|
|
programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect, isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeBatchBufferStartOrEnd<Family>::programConditionalRegRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, AluRegisters compareReg0,
|
|
AluRegisters compareReg1, CompareOperation compareOperation, bool indirect, bool isBcs) {
|
|
|
|
programConditionalBatchBufferStartBase(commandStream, startAddress, compareReg0, compareReg1, compareOperation, indirect, isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeBatchBufferStartOrEnd<Family>::programConditionalRegMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint32_t compareReg,
|
|
CompareOperation compareOperation, bool indirect, bool isBcs) {
|
|
EncodeSetMMIO<Family>::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress, isBcs);
|
|
LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);
|
|
|
|
EncodeSetMMIO<Family>::encodeREG(commandStream, RegisterOffsets::csGprR8, compareReg, isBcs);
|
|
LriHelper<Family>::program(&commandStream, RegisterOffsets::csGprR8 + 4, 0, true, isBcs);
|
|
|
|
programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect, isBcs);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeBatchBufferStartOrEnd<Family>::programConditionalBatchBufferStartBase(LinearStream &commandStream, uint64_t startAddress, AluRegisters regA, AluRegisters regB,
|
|
CompareOperation compareOperation, bool indirect, bool isBcs) {
|
|
EncodeAluHelper<Family, 4> aluHelper;
|
|
aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srca, regA);
|
|
aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srcb, regB);
|
|
aluHelper.setNextAlu(AluRegisters::opcodeSub);
|
|
|
|
if ((compareOperation == CompareOperation::equal) || (compareOperation == CompareOperation::notEqual)) {
|
|
aluHelper.setNextAlu(AluRegisters::opcodeStore, AluRegisters::gpr7, AluRegisters::zf);
|
|
} else if ((compareOperation == CompareOperation::greaterOrEqual) || (compareOperation == CompareOperation::less)) {
|
|
aluHelper.setNextAlu(AluRegisters::opcodeStore, AluRegisters::gpr7, AluRegisters::cf);
|
|
} else {
|
|
UNRECOVERABLE_IF(true);
|
|
}
|
|
|
|
aluHelper.copyToCmdStream(commandStream);
|
|
|
|
EncodeSetMMIO<Family>::encodeREG(commandStream, RegisterOffsets::csPredicateResult2, RegisterOffsets::csGprR7, isBcs);
|
|
|
|
MiPredicateType predicateType = MiPredicateType::noopOnResult2Clear; // Equal or Less
|
|
if ((compareOperation == CompareOperation::notEqual) || (compareOperation == CompareOperation::greaterOrEqual)) {
|
|
predicateType = MiPredicateType::noopOnResult2Set;
|
|
}
|
|
|
|
EncodeMiPredicate<Family>::encode(commandStream, predicateType);
|
|
|
|
programBatchBufferStart(&commandStream, startAddress, false, indirect, true);
|
|
|
|
EncodeMiPredicate<Family>::encode(commandStream, MiPredicateType::disable);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(MI_BATCH_BUFFER_START *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
|
|
MI_BATCH_BUFFER_START cmd = Family::cmdInitBatchBufferStart;
|
|
if (secondLevel) {
|
|
cmd.setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH);
|
|
}
|
|
cmd.setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT);
|
|
cmd.setBatchBufferStartAddress(address);
|
|
|
|
appendBatchBufferStart(cmd, indirect, predicate);
|
|
|
|
*cmdBuffer = cmd;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
|
|
programBatchBufferStart(commandStream->getSpaceForCmd<MI_BATCH_BUFFER_START>(), address, secondLevel, indirect, predicate);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferEnd(LinearStream &commandStream) {
|
|
MI_BATCH_BUFFER_END cmd = Family::cmdInitBatchBufferEnd;
|
|
auto buffer = commandStream.getSpaceForCmd<MI_BATCH_BUFFER_END>();
|
|
*buffer = cmd;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferEnd(CommandContainer &container) {
|
|
programBatchBufferEnd(*container.getCommandStream());
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void EncodeMiFlushDW<GfxFamily>::appendWa(LinearStream &commandStream, MiFlushArgs &args) {
|
|
BlitCommandsHelper<GfxFamily>::dispatchDummyBlit(commandStream, args.waArgs);
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMiFlushDW<Family>::programWithWa(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData,
|
|
MiFlushArgs &args) {
|
|
UNRECOVERABLE_IF(args.waArgs.isWaRequired && !args.commandWithPostSync);
|
|
appendWa(commandStream, args);
|
|
args.waArgs.isWaRequired = false;
|
|
|
|
auto miFlushDwCmd = commandStream.getSpaceForCmd<MI_FLUSH_DW>();
|
|
MI_FLUSH_DW miFlush = Family::cmdInitMiFlushDw;
|
|
if (args.commandWithPostSync) {
|
|
auto postSyncType = args.timeStampOperation ? MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_TIMESTAMP_REGISTER : MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD;
|
|
miFlush.setPostSyncOperation(postSyncType);
|
|
miFlush.setDestinationAddress(immediateDataGpuAddress);
|
|
miFlush.setImmediateData(immediateData);
|
|
}
|
|
miFlush.setNotifyEnable(args.notifyEnable);
|
|
miFlush.setTlbInvalidate(args.tlbFlush);
|
|
adjust(&miFlush, args.waArgs.rootDeviceEnvironment->getProductHelper());
|
|
*miFlushDwCmd = miFlush;
|
|
}
|
|
|
|
template <typename Family>
|
|
size_t EncodeMiFlushDW<Family>::getWaSize(const EncodeDummyBlitWaArgs &waArgs) {
|
|
return BlitCommandsHelper<Family>::getDummyBlitSize(waArgs);
|
|
}
|
|
|
|
template <typename Family>
|
|
size_t EncodeMiFlushDW<Family>::getCommandSizeWithWa(const EncodeDummyBlitWaArgs &waArgs) {
|
|
return sizeof(typename Family::MI_FLUSH_DW) + EncodeMiFlushDW<Family>::getWaSize(waArgs);
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeMemoryPrefetch<Family>::programMemoryPrefetch(LinearStream &commandStream, const GraphicsAllocation &graphicsAllocation, uint32_t size, size_t offset, const RootDeviceEnvironment &rootDeviceEnvironment) {}
|
|
|
|
template <typename Family>
|
|
inline size_t EncodeMemoryPrefetch<Family>::getSizeForMemoryPrefetch(size_t size, const RootDeviceEnvironment &rootDeviceEnvironment) { return 0u; }
|
|
|
|
template <typename Family>
|
|
void EncodeMiArbCheck<Family>::program(LinearStream &commandStream, std::optional<bool> preParserDisable) {
|
|
MI_ARB_CHECK cmd = Family::cmdInitArbCheck;
|
|
|
|
EncodeMiArbCheck<Family>::adjust(cmd, preParserDisable);
|
|
auto miArbCheckStream = commandStream.getSpaceForCmd<MI_ARB_CHECK>();
|
|
*miArbCheckStream = cmd;
|
|
}
|
|
|
|
template <typename Family>
|
|
size_t EncodeMiArbCheck<Family>::getCommandSize() { return sizeof(MI_ARB_CHECK); }
|
|
|
|
template <typename Family>
|
|
inline void EncodeNoop<Family>::alignToCacheLine(LinearStream &commandStream) {
|
|
auto used = commandStream.getUsed();
|
|
auto alignment = MemoryConstants::cacheLineSize;
|
|
auto partialCacheline = used & (alignment - 1);
|
|
if (partialCacheline) {
|
|
auto amountToPad = alignment - partialCacheline;
|
|
auto pCmd = commandStream.getSpace(amountToPad);
|
|
memset(pCmd, 0, amountToPad);
|
|
}
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeNoop<Family>::emitNoop(LinearStream &commandStream, size_t bytesToUpdate) {
|
|
if (bytesToUpdate) {
|
|
auto ptr = commandStream.getSpace(bytesToUpdate);
|
|
memset(ptr, 0, bytesToUpdate);
|
|
}
|
|
}
|
|
|
|
template <typename Family>
|
|
inline void EncodeStoreMemory<Family>::programStoreDataImm(LinearStream &commandStream,
|
|
uint64_t gpuAddress,
|
|
uint32_t dataDword0,
|
|
uint32_t dataDword1,
|
|
bool storeQword,
|
|
bool workloadPartitionOffset,
|
|
void **outCmdPtr) {
|
|
auto miStoreDataImmBuffer = commandStream.getSpaceForCmd<MI_STORE_DATA_IMM>();
|
|
if (outCmdPtr != nullptr) {
|
|
*outCmdPtr = miStoreDataImmBuffer;
|
|
}
|
|
EncodeStoreMemory<Family>::programStoreDataImm(miStoreDataImmBuffer,
|
|
gpuAddress,
|
|
dataDword0,
|
|
dataDword1,
|
|
storeQword,
|
|
workloadPartitionOffset);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void EncodeEnableRayTracing<GfxFamily>::append3dStateBtd(void *ptr3dStateBtd) {}
|
|
|
|
template <typename GfxFamily>
|
|
inline void EncodeWA<GfxFamily>::setAdditionalPipeControlFlagsForNonPipelineStateCommand(PipeControlArgs &args) {}
|
|
|
|
template <typename Family>
|
|
size_t EncodeMemoryFence<Family>::getSystemMemoryFenceSize() {
|
|
return 0;
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMemoryFence<Family>::encodeSystemMemoryFence(LinearStream &commandStream, const GraphicsAllocation *globalFenceAllocation) {
|
|
}
|
|
|
|
template <typename Family>
|
|
void EncodeMiPredicate<Family>::encode(LinearStream &cmdStream, [[maybe_unused]] MiPredicateType predicateType) {
|
|
if constexpr (Family::isUsingMiSetPredicate) {
|
|
using MI_SET_PREDICATE = typename Family::MI_SET_PREDICATE;
|
|
using PREDICATE_ENABLE = typename MI_SET_PREDICATE::PREDICATE_ENABLE;
|
|
|
|
auto miSetPredicate = Family::cmdInitSetPredicate;
|
|
miSetPredicate.setPredicateEnable(static_cast<PREDICATE_ENABLE>(predicateType));
|
|
|
|
*cmdStream.getSpaceForCmd<MI_SET_PREDICATE>() = miSetPredicate;
|
|
}
|
|
}
|
|
|
|
template <typename Family>
|
|
void EnodeUserInterrupt<Family>::encode(LinearStream &commandStream) {
|
|
*commandStream.getSpaceForCmd<typename Family::MI_USER_INTERRUPT>() = Family::cmdInitUserInterrupt;
|
|
}
|
|
|
|
} // namespace NEO
|