Implement implicit args for indirect dispatch

Related-To: NEO-5081
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2021-09-24 12:20:21 +00:00
committed by Compute-Runtime-Automation
parent 8fd49c4e7c
commit 92bf4b978a
9 changed files with 338 additions and 27 deletions

View File

@@ -38,6 +38,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
auto sizeCrossThreadData = dispatchInterface->getCrossThreadDataSize();
auto sizePerThreadData = dispatchInterface->getPerThreadDataSize();
auto sizePerThreadDataForWholeGroup = dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
auto pImplicitArgs = dispatchInterface->getImplicitArgs();
const HardwareInfo &hwInfo = device->getHardwareInfo();
@@ -51,7 +52,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
}
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec,
isInternal, isCooperative);
isInternal, isCooperative, isIndirect, dispatchInterface);
if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
*bbEnd = Family::cmdInitBatchBufferEnd;
@@ -142,7 +143,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
UNRECOVERABLE_IF(!(ptr));
offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast<uint64_t>(heapIndirect->getUsed() - sizeThreadData);
auto pImplicitArgs = dispatchInterface->getImplicitArgs();
if (pImplicitArgs) {
offsetThreadData -= sizeof(ImplicitArgs);
pImplicitArgs->localIdTablePtr = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - iohRequiredSize;
@@ -154,7 +154,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
if (isIndirect) {
void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData);
EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface);
void *implicitArgsGpuPtr = nullptr;
if (pImplicitArgs) {
implicitArgsGpuPtr = reinterpret_cast<void *>(reinterpret_cast<uint64_t>(gpuPtr) - sizeof(ImplicitArgs));
}
EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface, implicitArgsGpuPtr);
}
ptr = ptrOffset(ptr, sizeCrossThreadData);
@@ -330,7 +334,7 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTO
template <typename Family>
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart,
const Vec3<size_t> &groupCount, bool isInternal,
bool isCooperative) {
bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface) {
using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
@@ -345,6 +349,15 @@ size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
if (isIndirect) {
UNRECOVERABLE_IF(dispatchInterface == nullptr);
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), false);
if (dispatchInterface->getImplicitArgs()) {
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), true);
}
}
totalSize += sizeof(MI_BATCH_BUFFER_END);