Refactor partitioning of dispatched kernels

Related-To: NEO-6589

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2022-01-13 17:21:59 +00:00
committed by Compute-Runtime-Automation
parent 182042b04d
commit 9c4f05387b
12 changed files with 53 additions and 56 deletions

View File

@@ -61,7 +61,8 @@ struct EncodeDispatchKernel {
static void *getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset);
static size_t estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount,
bool isInternal, bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface);
bool isInternal, bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface,
bool isPartitioned);
static bool isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
size_t *lws,

View File

@@ -50,7 +50,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
}
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(args.device, threadStartVec, threadDimsVec,
args.isInternal, args.isCooperative, args.isIndirect,
args.dispatchInterface);
args.dispatchInterface, false);
if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
*bbEnd = Family::cmdInitBatchBufferEnd;
@@ -333,7 +333,8 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTO
template <typename Family>
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart,
const Vec3<size_t> &groupCount, bool isInternal,
bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface) {
bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface,
bool isPartitioned) {
using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;

View File

@@ -59,7 +59,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
}
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(args.device, threadStartVec, threadDimsVec,
args.isInternal, args.isCooperative, args.isIndirect, args.dispatchInterface);
args.isInternal, args.isCooperative, args.isIndirect, args.dispatchInterface,
args.partitionCount > 1);
if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
*bbEnd = Family::cmdInitBatchBufferEnd;
@@ -199,7 +200,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
}
bool requiresGlobalAtomicsUpdate = false;
if (ImplicitScalingHelper::isImplicitScalingEnabled(container.getDevice()->getDeviceBitfield(), true)) {
if (args.partitionCount > 1) {
requiresGlobalAtomicsUpdate = container.lastSentUseGlobalAtomics != args.useGlobalAtomics;
container.lastSentUseGlobalAtomics = args.useGlobalAtomics;
}
@@ -269,7 +270,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *args.device);
if (ImplicitScalingHelper::isImplicitScalingEnabled(args.device->getDeviceBitfield(), !args.isCooperative) &&
if ((args.partitionCount > 1 && !args.isCooperative) &&
!args.isInternal) {
const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
if (args.eventAddress != 0) {
@@ -440,7 +441,8 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
template <typename Family>
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart,
const Vec3<size_t> &groupCount, bool isInternal,
bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface) {
bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface,
bool isPartitioned) {
size_t totalSize = sizeof(WALKER_TYPE);
totalSize += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
@@ -457,7 +459,7 @@ size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device
}
}
if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), !isCooperative) &&
if ((isPartitioned && !isCooperative) &&
!isInternal) {
const bool staticPartitioning = device->getDefaultEngine().commandStreamReceiver->isStaticWorkPartitioningEnabled();
totalSize += ImplicitScalingDispatch<Family>::getSize(true, staticPartitioning, device->getDeviceBitfield(), groupStart, groupCount);