fix: patching payload arguments in inline data in case of indirect kernel
Related-To: NEO-14532
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>

parent 95e0244f70
commit bb518adf34
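When an indirectly dispatched kernel receives part of its payload (group count, global work size, work dim) through the walker command's inline data, the GPU virtual address of those destinations is not yet known at the point where EncodeIndirectParams<Family>::encode emits the MI_STORE_REGISTER_MEM commands. This change records every such store, together with its bare payload offset, in IndirectParamsInInlineDataArgs::commandsToPatch; once the walker is placed and the inline-data GPU VA is known, applyInlineDataGpuVA re-encodes each recorded store with its final destination (payload offset + inline-data VA).

A minimal sketch of that record-then-patch flow, using simplified stand-in types (StoreParams, FakeStore, reencodeStore and all numeric values are illustrative, not NEO's actual API):

    #include <cstdint>
    #include <vector>

    struct StoreParams {   // stand-in for EncodeStoreMMIOParams
        uint64_t address;  // payload offset; patched to a full VA later
        void *command;     // CPU pointer to the emitted store command
        uint32_t offset;   // source register offset
    };

    struct FakeStore {     // stand-in for MI_STORE_REGISTER_MEM
        uint32_t reg;
        uint64_t dstVa;
    };

    void reencodeStore(void *cmd, uint32_t reg, uint64_t dstVa) {
        auto store = static_cast<FakeStore *>(cmd);
        store->reg = reg;
        store->dstVa = dstVa;
    }

    int main() {
        std::vector<StoreParams> commandsToPatch;
        FakeStore store{};

        // Phase 1, encode time: the destination VA is unknown, so the store is
        // emitted with the bare payload offset and recorded for patching.
        reencodeStore(&store, 0x2600, 0x20);
        commandsToPatch.push_back({0x20, &store, 0x2600});

        // Phase 2, after the walker lands: the inline-data GPU VA is known and
        // every recorded store is re-encoded with its absolute destination.
        uint64_t inlineDataGpuVa = 0x12340000;
        for (auto &p : commandsToPatch) {
            reencodeStore(p.command, p.offset, p.address + inlineDataGpuVa);
        }
        return 0; // store.dstVa is now 0x12340020
    }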
@@ -312,7 +312,6 @@ struct CommandListCoreFamily : public CommandListImp {
     void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes,
                                   const void **pRanges);
 
-    ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]);
     ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions, size_t &patchIndex);
     void programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize, size_t &patchIndex);
     void appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation);
@@ -3637,13 +3637,6 @@ inline bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(NEO::Memor
            directionOut != NEO::TransferDirection::localToLocal;
 }
 
-template <GFXCORE_FAMILY gfxCoreFamily>
-ze_result_t CommandListCoreFamily<gfxCoreFamily>::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]) {
-    NEO::EncodeIndirectParams<GfxFamily>::setGlobalWorkSizeIndirect(commandContainer, offsets, crossThreadAddress, lws);
-
-    return ZE_RESULT_SUCCESS;
-}
-
 template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::programStateBaseAddress(NEO::CommandContainer &container, bool useSbaProperties) {
     using STATE_BASE_ADDRESS = typename GfxFamily::STATE_BASE_ADDRESS;
@@ -1183,7 +1183,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT
     uint32_t groupSize[] = {1, 1, 1};
     auto estimate = EncodeIndirectParams<FamilyType>::getCmdsSizeForSetWorkDimIndirect(groupSize, false);
     auto sizeBefore = commandList->getCmdContainer().getCommandStream()->getUsed();
-    EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->getCmdContainer(), 0x4, 0u, groupSize);
+    EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->getCmdContainer(), 0x4, 0u, groupSize, nullptr);
     auto sizeAfter = commandList->getCmdContainer().getCommandStream()->getUsed();
     EXPECT_LE(sizeAfter - sizeBefore, estimate);
 }
@@ -1191,7 +1191,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT
     uint32_t groupSize[] = {1, 1, 2};
     auto estimate = EncodeIndirectParams<FamilyType>::getCmdsSizeForSetWorkDimIndirect(groupSize, false);
     auto sizeBefore = commandList->getCmdContainer().getCommandStream()->getUsed();
-    EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->getCmdContainer(), 0x4, 0u, groupSize);
+    EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->getCmdContainer(), 0x4, 0u, groupSize, nullptr);
     auto sizeAfter = commandList->getCmdContainer().getCommandStream()->getUsed();
     EXPECT_LE(sizeAfter - sizeBefore, estimate);
 }
@@ -1199,7 +1199,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT
     uint32_t groupSize[] = {1, 1, 1};
     auto estimate = EncodeIndirectParams<FamilyType>::getCmdsSizeForSetWorkDimIndirect(groupSize, true);
     auto sizeBefore = commandList->getCmdContainer().getCommandStream()->getUsed();
-    EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->getCmdContainer(), 0x2, 0u, groupSize);
+    EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->getCmdContainer(), 0x2, 0u, groupSize, nullptr);
     auto sizeAfter = commandList->getCmdContainer().getCommandStream()->getUsed();
     EXPECT_LE(sizeAfter - sizeBefore, estimate);
 }
@@ -1207,7 +1207,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT
     uint32_t groupSize[] = {1, 1, 2};
     auto estimate = EncodeIndirectParams<FamilyType>::getCmdsSizeForSetWorkDimIndirect(groupSize, true);
     auto sizeBefore = commandList->getCmdContainer().getCommandStream()->getUsed();
-    EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->getCmdContainer(), 0x2, 0u, groupSize);
+    EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->getCmdContainer(), 0x2, 0u, groupSize, nullptr);
     auto sizeAfter = commandList->getCmdContainer().getCommandStream()->getUsed();
     EXPECT_LE(sizeAfter - sizeBefore, estimate);
 }
@@ -97,6 +97,14 @@ struct EncodeDispatchKernelArgs {
     }
 };
 
+struct EncodeStoreMMIOParams {
+    uint64_t address;
+    void *command;
+    uint32_t offset;
+    bool workloadPartition;
+    bool isBcs;
+};
+
 enum class MiPredicateType : uint32_t {
     disable = 0,
     noopOnResult2Clear = 1,
@@ -346,7 +354,7 @@ struct EncodeMathMMIO {
 
     static const size_t size = sizeof(MI_STORE_REGISTER_MEM);
 
-    static void encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs);
+    static void encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs, EncodeStoreMMIOParams *outStoreMMIOParams);
 
     static void encodeGreaterThanPredicate(CommandContainer &container, uint64_t lhsVal, uint32_t rhsVal, bool isBcs);
 
@@ -387,6 +395,13 @@ struct EncodeMathMMIO {
     static void encodeIncrementOrDecrement(LinearStream &cmdStream, AluRegisters operandRegister, IncrementOrDecrementOperation operationType, bool isBcs);
 };
 
+struct IndirectParamsInInlineDataArgs {
+    std::vector<EncodeStoreMMIOParams> commandsToPatch;
+    bool storeGroupCountInInlineData[3];
+    bool storeGlobalWorkSizeInInlineData[3];
+    bool storeWorkDimInInlineData;
+};
+
 template <typename GfxFamily>
 struct EncodeIndirectParams {
     using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
@@ -396,10 +411,11 @@ struct EncodeIndirectParams {
     using MI_MATH = typename GfxFamily::MI_MATH;
     using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
 
-    static void encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr);
-    static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress);
-    static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, uint64_t crossThreadAddress, const uint32_t *groupSize);
-    static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws);
+    static void encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr, IndirectParamsInInlineDataArgs *outArgs);
+    static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, IndirectParamsInInlineDataArgs *outArgs);
+    static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, uint64_t crossThreadAddress, const uint32_t *groupSize, IndirectParamsInInlineDataArgs *outArgs);
+    static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws, IndirectParamsInInlineDataArgs *outArgs);
+    static void applyInlineDataGpuVA(IndirectParamsInInlineDataArgs &args, uint64_t inlineDataGpuVa);
 
     static size_t getCmdsSizeForSetWorkDimIndirect(const uint32_t *groupSize, bool misalignedPtr);
 };
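The outArgs bookkeeping declared above is driven by one eligibility test (see the encode() hunk further down): a payload argument is routed through the patch list only when its cross-thread offset is defined and falls inside the inline-data region. A self-contained restatement of that predicate; the 16-bit offset type and the max-value sentinel mirror NEO's CrossThreadDataOffset and undefined<>, which is an assumption here:

    #include <cstdint>
    #include <limits>

    using CrossThreadDataOffset = uint16_t; // assumed width
    constexpr CrossThreadDataOffset undefinedOffset =
        std::numeric_limits<CrossThreadDataOffset>::max(); // assumed sentinel

    // Mirrors the check added in EncodeIndirectParams<Family>::encode: the
    // offset is defined and lies below inlineDataPayloadSize.
    bool storedInInlineData(CrossThreadDataOffset offset, uint16_t inlineDataPayloadSize) {
        return offset != undefinedOffset && inlineDataPayloadSize > offset;
    }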
@@ -117,7 +117,7 @@ uint32_t EncodeStates<Family>::copySamplerState(IndirectHeap *dsh,
 }
 
 template <typename Family>
-void EncodeMathMMIO<Family>::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs) {
+void EncodeMathMMIO<Family>::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs, EncodeStoreMMIOParams *outStoreMMIOParams) {
     int logLws = 0;
     int i = val;
     while (val >> logLws) {
@@ -139,7 +139,15 @@ void EncodeMathMMIO<Family>::encodeMulRegVal(CommandContainer &container, uint32
         EncodeSetMMIO<Family>::encodeREG(container, RegisterOffsets::csGprR0, RegisterOffsets::csGprR2, isBcs);
         i++;
     }
-    EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RegisterOffsets::csGprR1, dstAddress, false, nullptr, isBcs);
+    void **outStoreMMIOCmd = nullptr;
+    if (outStoreMMIOParams) {
+        outStoreMMIOParams->address = dstAddress;
+        outStoreMMIOParams->offset = RegisterOffsets::csGprR1;
+        outStoreMMIOParams->workloadPartition = false;
+        outStoreMMIOParams->isBcs = isBcs;
+        outStoreMMIOCmd = &outStoreMMIOParams->command;
+    }
+    EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RegisterOffsets::csGprR1, dstAddress, false, outStoreMMIOCmd, isBcs);
 }
 
 /*
@@ -586,44 +594,75 @@ bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDes
 }
 
 template <typename Family>
-void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr) {
+void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr, IndirectParamsInInlineDataArgs *outArgs) {
     const auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
-    setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa);
-    setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
+    if (outArgs) {
+        for (int i = 0; i < 3; i++) {
+            if (!NEO::isUndefinedOffset(kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[i]) && kernelDescriptor.kernelAttributes.inlineDataPayloadSize > kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[i]) {
+                outArgs->storeGroupCountInInlineData[i] = true;
+            }
+            if (!NEO::isUndefinedOffset(kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[i]) && kernelDescriptor.kernelAttributes.inlineDataPayloadSize > kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[i]) {
+                outArgs->storeGlobalWorkSizeInInlineData[i] = true;
+            }
+        }
+        if (!NEO::isUndefinedOffset(kernelDescriptor.payloadMappings.dispatchTraits.workDim) && kernelDescriptor.kernelAttributes.inlineDataPayloadSize > kernelDescriptor.payloadMappings.dispatchTraits.workDim) {
+            outArgs->storeWorkDimInInlineData = true;
+        }
+    }
+    setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa, outArgs);
+    setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize(), outArgs);
     UNRECOVERABLE_IF(NEO::isValidOffset(kernelDescriptor.payloadMappings.dispatchTraits.workDim) && (kernelDescriptor.payloadMappings.dispatchTraits.workDim & 0b11) != 0u);
-    setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
+    setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize(), outArgs);
     if (implicitArgsGpuPtr) {
         const auto version = container.getDevice()->getGfxCoreHelper().getImplicitArgsVersion();
         if (version == 0) {
             constexpr CrossThreadDataOffset groupCountOffset[] = {offsetof(ImplicitArgsV0, groupCountX), offsetof(ImplicitArgsV0, groupCountY), offsetof(ImplicitArgsV0, groupCountZ)};
             constexpr CrossThreadDataOffset globalSizeOffset[] = {offsetof(ImplicitArgsV0, globalSizeX), offsetof(ImplicitArgsV0, globalSizeY), offsetof(ImplicitArgsV0, globalSizeZ)};
             constexpr auto numWorkDimOffset = offsetof(ImplicitArgsV0, numWorkDim);
-            setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr);
-            setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
-            setWorkDimIndirect(container, numWorkDimOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
+            setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr, nullptr);
+            setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr);
+            setWorkDimIndirect(container, numWorkDimOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr);
         } else if (version == 1) {
             constexpr CrossThreadDataOffset groupCountOffsetV1[] = {offsetof(ImplicitArgsV1, groupCountX), offsetof(ImplicitArgsV1, groupCountY), offsetof(ImplicitArgsV1, groupCountZ)};
             constexpr CrossThreadDataOffset globalSizeOffsetV1[] = {offsetof(ImplicitArgsV1, globalSizeX), offsetof(ImplicitArgsV1, globalSizeY), offsetof(ImplicitArgsV1, globalSizeZ)};
             constexpr auto numWorkDimOffsetV1 = offsetof(ImplicitArgsV1, numWorkDim);
-            setGroupCountIndirect(container, groupCountOffsetV1, implicitArgsGpuPtr);
-            setGlobalWorkSizeIndirect(container, globalSizeOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
-            setWorkDimIndirect(container, numWorkDimOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
+            setGroupCountIndirect(container, groupCountOffsetV1, implicitArgsGpuPtr, nullptr);
+            setGlobalWorkSizeIndirect(container, globalSizeOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr);
+            setWorkDimIndirect(container, numWorkDimOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr);
         }
     }
 }
 
 template <typename Family>
-void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress) {
+void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, IndirectParamsInInlineDataArgs *outArgs) {
     for (int i = 0; i < 3; ++i) {
         if (NEO::isUndefinedOffset(offsets[i])) {
             continue;
         }
-        EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RegisterOffsets::gpgpuDispatchDim[i], ptrOffset(crossThreadAddress, offsets[i]), false, nullptr, false);
+        void **storeCmd = nullptr;
+        if (outArgs && outArgs->storeGroupCountInInlineData[i]) {
+            outArgs->commandsToPatch.push_back({});
+            auto &commandArgs = outArgs->commandsToPatch.back();
+            storeCmd = &commandArgs.command;
+            commandArgs.address = offsets[i];
+            commandArgs.offset = RegisterOffsets::gpgpuDispatchDim[i];
+            commandArgs.isBcs = false;
+            commandArgs.workloadPartition = false;
+        }
+        EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RegisterOffsets::gpgpuDispatchDim[i], ptrOffset(crossThreadAddress, offsets[i]), false, storeCmd, false);
     }
 }
 
 template <typename Family>
-void EncodeIndirectParams<Family>::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, uint64_t crossThreadAddress, const uint32_t *groupSize) {
+void EncodeIndirectParams<Family>::applyInlineDataGpuVA(IndirectParamsInInlineDataArgs &args, uint64_t inlineDataGpuVa) {
+    for (auto &commandArgs : args.commandsToPatch) {
+        auto commandToPatch = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandArgs.command);
+        EncodeStoreMMIO<Family>::encode(commandToPatch, commandArgs.offset, commandArgs.address + inlineDataGpuVa, commandArgs.workloadPartition, commandArgs.isBcs);
+    }
+}
+
+template <typename Family>
+void EncodeIndirectParams<Family>::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, uint64_t crossThreadAddress, const uint32_t *groupSize, IndirectParamsInInlineDataArgs *outArgs) {
     if (NEO::isValidOffset(workDimOffset)) {
         auto dstPtr = ptrOffset(crossThreadAddress, workDimOffset);
         constexpr uint32_t resultRegister = RegisterOffsets::csGprR0;
@@ -709,7 +748,17 @@ void EncodeIndirectParams<Family>::setWorkDimIndirect(CommandContainer &containe
                 EncodeMath<Family>::addition(container, resultAluRegister, backupAluRegister, resultAluRegister);
             }
         }
-        EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), resultRegister, dstPtr, false, nullptr, false);
+        void **storeCmd = nullptr;
+        if (outArgs && outArgs->storeWorkDimInInlineData) {
+            outArgs->commandsToPatch.push_back({});
+            auto &commandArgs = outArgs->commandsToPatch.back();
+            storeCmd = &commandArgs.command;
+            commandArgs.address = workDimOffset;
+            commandArgs.offset = resultRegister;
+            commandArgs.isBcs = false;
+            commandArgs.workloadPartition = false;
+        }
+        EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), resultRegister, dstPtr, false, storeCmd, false);
     }
 }
 
@@ -777,12 +826,20 @@ size_t EncodeDispatchKernel<Family>::getDefaultDshAlignment() {
 }
 
 template <typename Family>
-void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) {
+void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws, IndirectParamsInInlineDataArgs *outArgs) {
     for (int i = 0; i < 3; ++i) {
         if (NEO::isUndefinedOffset(offsets[i])) {
             continue;
         }
-        EncodeMathMMIO<Family>::encodeMulRegVal(container, RegisterOffsets::gpgpuDispatchDim[i], lws[i], ptrOffset(crossThreadAddress, offsets[i]), false);
+        EncodeStoreMMIOParams *storeParams = nullptr;
+
+        auto patchLocation = ptrOffset(crossThreadAddress, offsets[i]);
+        if (outArgs && outArgs->storeGlobalWorkSizeInInlineData[i]) {
+            outArgs->commandsToPatch.push_back({});
+            storeParams = &outArgs->commandsToPatch.back();
+            patchLocation = offsets[i];
+        }
+        EncodeMathMMIO<Family>::encodeMulRegVal(container, RegisterOffsets::gpgpuDispatchDim[i], lws[i], patchLocation, false, storeParams);
     }
 }
 
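Note the asymmetry this hunk introduces: when the destination lies in inline data, encodeMulRegVal receives the bare payload offset (patchLocation = offsets[i]) rather than a full VA, so the value recorded in commandsToPatch stays relative until applyInlineDataGpuVA runs. Worked through with the same illustrative numbers the new unit test uses:

    #include <cassert>
    #include <cstdint>

    int main() {
        uint64_t payloadOffset = 0x20;         // recorded as commandArgs.address
        uint64_t inlineDataGpuVa = 0x12340000; // known only after the walker lands
        // applyInlineDataGpuVA rewrites the store's destination to the sum:
        assert(payloadOffset + inlineDataGpuVa == 0x12340020);
        return 0;
    }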
@@ -251,6 +251,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
     uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, rootDeviceEnvironment);
     uint32_t sizeForImplicitArgsStruct = NEO::ImplicitArgsHelper::getSizeForImplicitArgsStruct(pImplicitArgs, kernelDescriptor, true, rootDeviceEnvironment);
     uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching + args.reserveExtraPayloadSpace;
+    IndirectParamsInInlineDataArgs encodeIndirectParamsArgs{};
     {
         void *ptr = nullptr;
         if (!args.makeCommandView) {
@@ -279,7 +280,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
                 if (pImplicitArgs) {
                     implicitArgsGpuPtr = gpuPtr + inlineDataProgrammingOffset - sizeForImplicitArgsStruct;
                 }
-                EncodeIndirectParams<Family>::encode(container, gpuPtr, args.dispatchInterface, implicitArgsGpuPtr);
+                EncodeIndirectParams<Family>::encode(container, gpuPtr, args.dispatchInterface, implicitArgsGpuPtr, &encodeIndirectParamsArgs);
             }
         } else {
             ptr = args.cpuPayloadBuffer;
@@ -460,6 +461,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
         }
     }
 
+    if (args.isIndirect) {
+        auto walkerGpuVa = listCmdBufferStream->getGpuBase() + ptrDiff(args.outWalkerPtr, listCmdBufferStream->getCpuBase());
+        EncodeIndirectParams<Family>::applyInlineDataGpuVA(encodeIndirectParamsArgs, walkerGpuVa + ptrDiff(walkerCmd.getInlineDataPointer(), &walkerCmd));
+    }
+
     if (args.cpuWalkerBuffer) {
         *reinterpret_cast<WalkerType *>(args.cpuWalkerBuffer) = walkerCmd;
     }
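The applyInlineDataGpuVA call site added above derives the inline-data GPU VA purely from pointer arithmetic: the walker's offset within the command stream yields its GPU VA, and the inline-data member's offset within the walker is added on top. A sketch of that translation; ptrDiff here is a local stand-in for NEO's helper of the same name, and the function shape is illustrative:

    #include <cstdint>

    inline uint64_t ptrDiff(const void *ptr, const void *base) {
        return reinterpret_cast<uintptr_t>(ptr) - reinterpret_cast<uintptr_t>(base);
    }

    // walkerGpuVa = stream GPU base + walker's CPU offset in the stream;
    // inline-data VA = walkerGpuVa + inline data's CPU offset in the walker.
    uint64_t computeInlineDataGpuVa(uint64_t streamGpuBase, const void *streamCpuBase,
                                    const void *walkerCpu, const void *inlineDataCpu) {
        uint64_t walkerGpuVa = streamGpuBase + ptrDiff(walkerCpu, streamCpuBase);
        return walkerGpuVa + ptrDiff(inlineDataCpu, walkerCpu);
    }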
@@ -213,7 +213,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
 
     if (args.isIndirect) {
         auto crossThreadDataGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData;
-        EncodeIndirectParams<Family>::encode(container, crossThreadDataGpuVA, args.dispatchInterface, implicitArgsGpuVA);
+        EncodeIndirectParams<Family>::encode(container, crossThreadDataGpuVA, args.dispatchInterface, implicitArgsGpuVA, nullptr);
     }
 
     ptr = ptrOffset(ptr, sizeCrossThreadData);
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -12,6 +12,7 @@
 #include "shared/test/common/fixtures/device_fixture.h"
 #include "shared/test/common/mocks/mock_device.h"
 #include "shared/test/common/test_macros/hw_test.h"
+#include "shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h"
 
 using namespace NEO;
 
@@ -231,7 +232,7 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupSizeIndirectThenCommandsAreCorr
     uint32_t crossThreadAddress[3] = {};
     uint32_t lws[3] = {2, 1, 1};
 
-    EncodeIndirectParams<FamilyType>::setGlobalWorkSizeIndirect(cmdContainer, offsets, reinterpret_cast<uint64_t>(crossThreadAddress), lws);
+    EncodeIndirectParams<FamilyType>::setGlobalWorkSizeIndirect(cmdContainer, offsets, reinterpret_cast<uint64_t>(crossThreadAddress), lws, nullptr);
 
     GenCmdList commands;
     CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
@@ -254,7 +255,7 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupCountIndirectThenCommandsAreCor
     CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)};
     uint32_t crossThreadAddress[3] = {};
 
-    EncodeIndirectParams<FamilyType>::setGroupCountIndirect(cmdContainer, offsets, reinterpret_cast<uint64_t>(crossThreadAddress));
+    EncodeIndirectParams<FamilyType>::setGroupCountIndirect(cmdContainer, offsets, reinterpret_cast<uint64_t>(crossThreadAddress), nullptr);
 
     GenCmdList commands;
     CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
@@ -274,6 +275,220 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupCountIndirectThenCommandsAreCor
     ASSERT_EQ(itor, commands.end());
 }
 
+HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenSettingGroupCountIndirectThenInlineDataRelatedCommandIsStoredInCommandsToPatch) {
+    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
+
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);
+
+    CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)};
+    uint64_t crossThreadGpuVa = 0xBADF000;
+
+    IndirectParamsInInlineDataArgs args{};
+    args.storeGroupCountInInlineData[1] = true;
+
+    EncodeIndirectParams<FamilyType>::setGroupCountIndirect(cmdContainer, offsets, crossThreadGpuVa, &args);
+
+    EXPECT_EQ(1u, args.commandsToPatch.size());
+
+    GenCmdList commands;
+    CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
+
+    auto itor = commands.begin();
+
+    itor = find<MI_STORE_REGISTER_MEM *>(itor, commands.end());
+    ASSERT_NE(itor, commands.end());
+    auto storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+    EXPECT_EQ(crossThreadGpuVa + offsets[0], storeRegMem->getMemoryAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+    ASSERT_NE(itor, commands.end());
+    EXPECT_EQ(*itor, args.commandsToPatch[0].command);
+    storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+    EXPECT_EQ(crossThreadGpuVa + offsets[1], storeRegMem->getMemoryAddress());
+    EXPECT_EQ(args.commandsToPatch[0].address, offsets[1]);
+    EXPECT_EQ(args.commandsToPatch[0].offset, storeRegMem->getRegisterAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+    ASSERT_NE(itor, commands.end());
+    storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+    EXPECT_EQ(crossThreadGpuVa + offsets[2], storeRegMem->getMemoryAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+    ASSERT_EQ(itor, commands.end());
+}
+
+HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenSettingGlobalGroupSizeIndirectThenInlineDataRelatedCommandIsStoredInCommandsToPatch) {
+    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
+
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);
+
+    CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)};
+    uint64_t crossThreadGpuVa = 0xBADF000;
+
+    IndirectParamsInInlineDataArgs args{};
+    args.storeGlobalWorkSizeInInlineData[1] = true;
+
+    uint32_t lws[3] = {1, 2, 3};
+
+    EncodeIndirectParams<FamilyType>::setGlobalWorkSizeIndirect(cmdContainer, offsets, crossThreadGpuVa, lws, &args);
+
+    EXPECT_EQ(1u, args.commandsToPatch.size());
+
+    GenCmdList commands;
+    CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
+
+    auto itor = commands.begin();
+
+    itor = find<MI_STORE_REGISTER_MEM *>(itor, commands.end());
+    ASSERT_NE(itor, commands.end());
+    auto storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+    EXPECT_EQ(crossThreadGpuVa + offsets[0], storeRegMem->getMemoryAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+    ASSERT_NE(itor, commands.end());
+    EXPECT_EQ(*itor, args.commandsToPatch[0].command);
+    storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+    EXPECT_EQ(offsets[1], storeRegMem->getMemoryAddress());
+    EXPECT_EQ(args.commandsToPatch[0].address, offsets[1]);
+    EXPECT_EQ(args.commandsToPatch[0].offset, storeRegMem->getRegisterAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+    ASSERT_NE(itor, commands.end());
+    storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+    EXPECT_EQ(crossThreadGpuVa + offsets[2], storeRegMem->getMemoryAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+    ASSERT_EQ(itor, commands.end());
+}
+
+HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenSettingWorkDimIndirectThenInlineDataRelatedCommandIsStoredInCommandsToPatch) {
+    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
+
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);
+
+    CrossThreadDataOffset offset = sizeof(uint32_t);
+    uint64_t crossThreadGpuVa = 0xBADF000;
+
+    IndirectParamsInInlineDataArgs args{};
+    args.storeWorkDimInInlineData = true;
+
+    uint32_t groupSizes[3] = {1, 2, 3};
+
+    EncodeIndirectParams<FamilyType>::setWorkDimIndirect(cmdContainer, offset, crossThreadGpuVa, groupSizes, &args);
+
+    EXPECT_EQ(1u, args.commandsToPatch.size());
+
+    GenCmdList commands;
+    CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
+
+    auto itor = commands.begin();
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+    ASSERT_NE(itor, commands.end());
+    EXPECT_EQ(*itor, args.commandsToPatch[0].command);
+    auto storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+    EXPECT_EQ(args.commandsToPatch[0].address, offset);
+    EXPECT_EQ(args.commandsToPatch[0].offset, storeRegMem->getRegisterAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+    ASSERT_EQ(itor, commands.end());
+}
+
+HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenEncodeIndirectParamsAndApplyingInlineGpuVaThenCorrectCommandsAreProgrammed) {
+    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
+
+    for (auto workDimInInlineData : ::testing::Bool()) {
+        CommandContainer cmdContainer;
+        cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);
+
+        uint64_t crossThreadGpuVa = 0xBADF000;
+
+        IndirectParamsInInlineDataArgs args{};
+
+        MockDispatchKernelEncoder dispatchInterface;
+
+        auto &kernelDescriptor = dispatchInterface.kernelDescriptor;
+        uint32_t groupSizes[3] = {1, 2, 3};
+        dispatchInterface.getGroupSizeResult = groupSizes;
+
+        kernelDescriptor.kernelAttributes.inlineDataPayloadSize = 0x100;
+        kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 0x8;
+        kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = 0x120;
+        kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = undefined<CrossThreadDataOffset>;
+
+        kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = undefined<CrossThreadDataOffset>;
+        kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = 0x20;
+        kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = 0x100;
+
+        kernelDescriptor.payloadMappings.dispatchTraits.workDim = workDimInInlineData ? 0x60 : 0x110;
+
+        EncodeIndirectParams<FamilyType>::encode(cmdContainer, crossThreadGpuVa, &dispatchInterface, 0u, &args);
+
+        if (workDimInInlineData) {
+            EXPECT_EQ(3u, args.commandsToPatch.size());
+        } else {
+            EXPECT_EQ(2u, args.commandsToPatch.size());
+        }
+        EXPECT_TRUE(args.storeGroupCountInInlineData[0]);
+        EXPECT_FALSE(args.storeGroupCountInInlineData[1]);
+        EXPECT_FALSE(args.storeGroupCountInInlineData[2]);
+
+        EXPECT_FALSE(args.storeGlobalWorkSizeInInlineData[0]);
+        EXPECT_TRUE(args.storeGlobalWorkSizeInInlineData[1]);
+        EXPECT_FALSE(args.storeGlobalWorkSizeInInlineData[2]);
+
+        EXPECT_EQ(workDimInInlineData, args.storeWorkDimInInlineData);
+
+        uint64_t inlineDataGpuVa = 0x12340000;
+        EncodeIndirectParams<FamilyType>::applyInlineDataGpuVA(args, inlineDataGpuVa);
+
+        GenCmdList commands;
+        CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
+
+        auto itor = commands.begin();
+        itor = find<MI_STORE_REGISTER_MEM *>(itor, commands.end());
+        ASSERT_NE(itor, commands.end());
+        auto storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+        EXPECT_EQ(storeRegMem, args.commandsToPatch[0].command);
+        EXPECT_EQ(inlineDataGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[0], storeRegMem->getMemoryAddress());
+
+        itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+        ASSERT_NE(itor, commands.end());
+        storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+        EXPECT_EQ(crossThreadGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[1], storeRegMem->getMemoryAddress());
+
+        itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+        ASSERT_NE(itor, commands.end());
+        storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+        EXPECT_EQ(storeRegMem, args.commandsToPatch[1].command);
+        EXPECT_EQ(inlineDataGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[1], storeRegMem->getMemoryAddress());
+
+        itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+        ASSERT_NE(itor, commands.end());
+        storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+        EXPECT_EQ(crossThreadGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[2], storeRegMem->getMemoryAddress());
+
+        itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+        ASSERT_NE(itor, commands.end());
+        storeRegMem = reinterpret_cast<MI_STORE_REGISTER_MEM *>(*itor);
+        if (workDimInInlineData) {
+            EXPECT_EQ(storeRegMem, args.commandsToPatch[2].command);
+            EXPECT_EQ(inlineDataGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.workDim, storeRegMem->getMemoryAddress());
+        } else {
+            EXPECT_EQ(crossThreadGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.workDim, storeRegMem->getMemoryAddress());
+        }
+
+        itor = find<MI_STORE_REGISTER_MEM *>(++itor, commands.end());
+        ASSERT_EQ(itor, commands.end());
+    }
+}
+
 using CommandEncodeAluTests = ::testing::Test;
 
 HWTEST_F(CommandEncodeAluTests, whenAskingForIncrementOrDecrementCmdsSizeThenReturnCorrectValue) {