Use uint64_t instead of void * in indirect dispatch programming

Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski 2021-10-06 17:32:04 +00:00 committed by Compute-Runtime-Automation
parent 5560663b01
commit 5d2d81b2d1
8 changed files with 25 additions and 25 deletions

View File

@ -230,7 +230,7 @@ struct CommandListCoreFamily : CommandListImp {
void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes,
const void **pRanges);
ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]);
ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]);
ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t *pThreadGroupDimensions);
void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb);
void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask);

View File

@ -2246,7 +2246,7 @@ void CommandListCoreFamily<gfxCoreFamily>::clearCommandsToPatch() {
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]) {
ze_result_t CommandListCoreFamily<gfxCoreFamily>::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
NEO::EncodeIndirectParams<GfxFamily>::setGlobalWorkSizeIndirect(commandContainer, offsets, crossThreadAddress, lws);

View File

@ -1043,7 +1043,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT
uint32_t groupSize[] = {1, 1, 1};
auto estimate = EncodeIndirectParams<FamilyType>::getCmdsSizeForSetWorkDimIndirect(groupSize, false);
auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->commandContainer, 0x4, nullptr, groupSize);
EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->commandContainer, 0x4, 0u, groupSize);
auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
EXPECT_LE(sizeAfter - sizeBefore, estimate);
}
@ -1051,7 +1051,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT
uint32_t groupSize[] = {1, 1, 2};
auto estimate = EncodeIndirectParams<FamilyType>::getCmdsSizeForSetWorkDimIndirect(groupSize, false);
auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->commandContainer, 0x4, nullptr, groupSize);
EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->commandContainer, 0x4, 0u, groupSize);
auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
EXPECT_LE(sizeAfter - sizeBefore, estimate);
}
@ -1059,7 +1059,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT
uint32_t groupSize[] = {1, 1, 1};
auto estimate = EncodeIndirectParams<FamilyType>::getCmdsSizeForSetWorkDimIndirect(groupSize, true);
auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->commandContainer, 0x2, nullptr, groupSize);
EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->commandContainer, 0x2, 0u, groupSize);
auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
EXPECT_LE(sizeAfter - sizeBefore, estimate);
}
@ -1067,7 +1067,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT
uint32_t groupSize[] = {1, 1, 2};
auto estimate = EncodeIndirectParams<FamilyType>::getCmdsSizeForSetWorkDimIndirect(groupSize, true);
auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->commandContainer, 0x2, nullptr, groupSize);
EncodeIndirectParams<FamilyType>::setWorkDimIndirect(commandList->commandContainer, 0x2, 0u, groupSize);
auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
EXPECT_LE(sizeAfter - sizeBefore, estimate);
}

View File

@ -170,10 +170,10 @@ struct EncodeIndirectParams {
using MI_MATH = typename GfxFamily::MI_MATH;
using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
static void encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, void *implicitArgsGpuPtr);
static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress);
static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, void *crossThreadAddress, const uint32_t *groupSize);
static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws);
static void encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr);
static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress);
static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, uint64_t crossThreadAddress, const uint32_t *groupSize);
static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws);
static size_t getCmdsSizeForIndirectParams();
static size_t getCmdsSizeForSetGroupSizeIndirect();

View File

@ -514,7 +514,7 @@ template <typename Family>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
template <typename Family>
void EncodeIndirectParams<Family>::encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, void *implicitArgsGpuPtr) {
void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr) {
const auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa);
setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
@ -530,19 +530,19 @@ void EncodeIndirectParams<Family>::encode(CommandContainer &container, void *cro
}
template <typename Family>
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) {
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress) {
for (int i = 0; i < 3; ++i) {
if (NEO::isUndefinedOffset(offsets[i])) {
continue;
}
EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), GPUGPU_DISPATCHDIM[i], ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[i]));
EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), GPUGPU_DISPATCHDIM[i], ptrOffset(crossThreadAddress, offsets[i]));
}
}
template <typename Family>
void EncodeIndirectParams<Family>::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, void *crossThreadAddress, const uint32_t *groupSize) {
void EncodeIndirectParams<Family>::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, uint64_t crossThreadAddress, const uint32_t *groupSize) {
if (NEO::isValidOffset(workDimOffset)) {
auto dstPtr = ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), workDimOffset);
auto dstPtr = ptrOffset(crossThreadAddress, workDimOffset);
constexpr uint32_t RESULT_REGISTER = CS_GPR_R0;
constexpr AluRegisters RESULT_ALU_REGISTER = AluRegisters::R_0;
const uint32_t offset = static_cast<uint32_t>((1ull << 8 * (dstPtr & 0b11)) - 1);
@ -650,12 +650,12 @@ template <typename Family>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {}
template <typename Family>
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws) {
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) {
for (int i = 0; i < 3; ++i) {
if (NEO::isUndefinedOffset(offsets[i])) {
continue;
}
EncodeMathMMIO<Family>::encodeMulRegVal(container, GPUGPU_DISPATCHDIM[i], lws[i], ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[i]));
EncodeMathMMIO<Family>::encodeMulRegVal(container, GPUGPU_DISPATCHDIM[i], lws[i], ptrOffset(crossThreadAddress, offsets[i]));
}
}

View File

@ -153,10 +153,10 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
dispatchInterface->getCrossThreadData(), sizeCrossThreadData);
if (isIndirect) {
void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData);
void *implicitArgsGpuPtr = nullptr;
auto gpuPtr = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData;
uint64_t implicitArgsGpuPtr = 0u;
if (pImplicitArgs) {
implicitArgsGpuPtr = reinterpret_cast<void *>(reinterpret_cast<uint64_t>(gpuPtr) - sizeof(ImplicitArgs));
implicitArgsGpuPtr = gpuPtr - sizeof(ImplicitArgs);
}
EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface, implicitArgsGpuPtr);
}

View File

@ -187,10 +187,10 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
crossThreadData, sizeCrossThreadData);
}
if (isIndirect) {
void *gpuPtr = reinterpret_cast<void *>(heap->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset));
void *implicitArgsGpuPtr = nullptr;
auto gpuPtr = heap->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset);
uint64_t implicitArgsGpuPtr = 0u;
if (pImplicitArgs) {
implicitArgsGpuPtr = reinterpret_cast<void *>(reinterpret_cast<uint64_t>(gpuPtr) + inlineDataProgrammingOffset - sizeof(ImplicitArgs));
implicitArgsGpuPtr = gpuPtr + inlineDataProgrammingOffset - sizeof(ImplicitArgs);
}
EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface, implicitArgsGpuPtr);
}

View File

@ -229,7 +229,7 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupSizeIndirectThenCommandsAreCorr
uint32_t crossThreadAdress[3] = {};
uint32_t lws[3] = {2, 1, 1};
EncodeIndirectParams<FamilyType>::setGlobalWorkSizeIndirect(cmdContainer, offsets, crossThreadAdress, lws);
EncodeIndirectParams<FamilyType>::setGlobalWorkSizeIndirect(cmdContainer, offsets, reinterpret_cast<uint64_t>(crossThreadAdress), lws);
GenCmdList commands;
CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
@ -254,7 +254,7 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupCountIndirectThenCommandsAreCor
CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)};
uint32_t crossThreadAdress[3] = {};
EncodeIndirectParams<FamilyType>::setGroupCountIndirect(cmdContainer, offsets, crossThreadAdress);
EncodeIndirectParams<FamilyType>::setGroupCountIndirect(cmdContainer, offsets, reinterpret_cast<uint64_t>(crossThreadAdress));
GenCmdList commands;
CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());