Disable implicit scaling for cooperative kernels

When implicit scaling is disabled, use useSingleSubdeviceValue = true.

Resolves: NEO-5757

Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
Filip Hazubski
2021-06-21 15:24:14 +00:00
committed by Compute-Runtime-Automation
parent b5d5784b81
commit 29c64c3dd0
27 changed files with 256 additions and 107 deletions
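
Before the file-by-file hunks, a minimal sketch of the behavior this change introduces, for orientation only: the types below (DeviceBitfield as a plain bitset, a toy ImplicitScalingHelper and encodeDispatch) are simplified stand-ins, not the real NEO signatures. A cooperative dispatch now carries isCooperative = true into the encoder, the encoder queries the implicit-scaling helper with !isCooperative, and the dispatch therefore stays on a single sub-device instead of being partitioned across tiles.

#include <bitset>
#include <iostream>

using DeviceBitfield = std::bitset<4>;

struct ImplicitScalingHelper {
    // Mirrors the shape of the call sites in the hunks below: implicit scaling
    // needs more than one sub-device plus a precondition that is now
    // "the kernel is not cooperative".
    static bool isImplicitScalingEnabled(const DeviceBitfield &devices, bool preCondition) {
        return devices.count() > 1 && preCondition;
    }
};

void encodeDispatch(const DeviceBitfield &devices, bool isInternal, bool isCooperative) {
    // Same predicate as in the encode()/estimate hunks: cooperative kernels
    // force the single-sub-device path, so no partition commands are emitted.
    if (ImplicitScalingHelper::isImplicitScalingEnabled(devices, !isCooperative) && !isInternal) {
        std::cout << "emit partitioned walker commands across sub-devices\n";
    } else {
        std::cout << "emit single sub-device walker commands\n";
    }
}

int main() {
    DeviceBitfield twoTiles{0b11};
    encodeDispatch(twoTiles, false, false); // regular kernel: implicit scaling stays enabled
    encodeDispatch(twoTiles, false, true);  // cooperative kernel: implicit scaling disabled
    return 0;
}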


@@ -44,7 +44,8 @@ struct EncodeDispatchKernel {
                        bool &requiresUncachedMocs,
                        bool useGlobalAtomics,
                        uint32_t &partitionCount,
-                       bool isInternal);
+                       bool isInternal,
+                       bool isCooperative);
     static void encodeAdditionalWalkerFields(const HardwareInfo &hwInfo, WALKER_TYPE &walkerCmd);
@@ -52,7 +53,8 @@ struct EncodeDispatchKernel {
     static void *getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset);
-    static size_t estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart, Vec3<size_t> groupCount, bool isInternal);
+    static size_t estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart, Vec3<size_t> groupCount,
+                                                       bool isInternal, bool isCooperative);
     static bool isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
                                                     size_t *lws,


@@ -26,7 +26,7 @@ template <typename Family>
 void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
                                           const void *pThreadGroupDimensions, bool isIndirect, bool isPredicate, DispatchKernelEncoderI *dispatchInterface,
                                           uint64_t eventAddress, bool isTimestampEvent, bool L3FlushEnable, Device *device, PreemptionMode preemptionMode,
-                                          bool &requiresUncachedMocs, bool useGlobalAtomics, uint32_t &partitionCount, bool isInternal) {
+                                          bool &requiresUncachedMocs, bool useGlobalAtomics, uint32_t &partitionCount, bool isInternal, bool isCooperative) {
     using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
     using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
@@ -49,7 +49,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
     if (!isIndirect) {
         threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
     }
-    size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec, isInternal);
+    size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec,
+                                                                        isInternal, isCooperative);
     if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
         auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
         *bbEnd = Family::cmdInitBatchBufferEnd;
@@ -319,7 +320,9 @@ template <typename Family>
 void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}
 
 template <typename Family>
-size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart, Vec3<size_t> groupCount, bool isInternal) {
+size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart,
+                                                                          Vec3<size_t> groupCount, bool isInternal,
+                                                                          bool isCooperative) {
     using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
     using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
     using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;


@@ -38,7 +38,7 @@ template <typename Family>
 void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
                                           const void *pThreadGroupDimensions, bool isIndirect, bool isPredicate, DispatchKernelEncoderI *dispatchInterface,
                                           uint64_t eventAddress, bool isTimestampEvent, bool L3FlushEnable, Device *device, PreemptionMode preemptionMode,
-                                          bool &requiresUncachedMocs, bool useGlobalAtomics, uint32_t &partitionCount, bool isInternal) {
+                                          bool &requiresUncachedMocs, bool useGlobalAtomics, uint32_t &partitionCount, bool isInternal, bool isCooperative) {
     using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
     using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;
     using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
@@ -59,7 +59,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
     if (!isIndirect) {
         threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
     }
-    size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec, isInternal);
+    size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec,
+                                                                        isInternal, isCooperative);
     if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
         auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
         *bbEnd = Family::cmdInitBatchBufferEnd;
@@ -256,7 +257,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
     PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *device);
-    if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), true) &&
+    if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), !isCooperative) &&
         !isInternal) {
         const uint64_t workPartitionAllocationGpuVa = device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
         ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
@@ -421,15 +422,17 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
 }
 template <typename Family>
-size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart, Vec3<size_t> groupCount,
-                                                                          bool isInternal) {
+size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart,
+                                                                          Vec3<size_t> groupCount, bool isInternal,
+                                                                          bool isCooperative) {
     size_t totalSize = sizeof(WALKER_TYPE);
     totalSize += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
     totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
     totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
     totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
     totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
-    if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), true) &&
+    if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), !isCooperative) &&
         !isInternal) {
         const bool staticPartitioning = device->getDefaultEngine().commandStreamReceiver->isStaticWorkPartitioningEnabled();
         totalSize += ImplicitScalingDispatch<Family>::getSize(true, staticPartitioning, device->getDeviceBitfield(), groupStart, groupCount);


@@ -83,7 +83,7 @@ class CommandStreamReceiver {
                                       uint32_t taskLevel, DispatchFlags &dispatchFlags, Device &device) = 0;
     virtual bool flushBatchedSubmissions() = 0;
-    bool submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);
+    MOCKABLE_VIRTUAL bool submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);
     virtual void programHardwareContext(LinearStream &cmdStream) = 0;
     virtual size_t getCmdsSizeForHardwareContext() const = 0;


@@ -58,7 +58,7 @@ class TbxCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw<GfxFa
     AubSubCaptureStatus checkAndActivateAubSubCapture(const MultiDispatchInfo &dispatchInfo) override;
     // Family specific version
-    MOCKABLE_VIRTUAL void submitBatchBuffer(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead);
+    MOCKABLE_VIRTUAL void submitBatchBufferTbx(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead);
     void pollForCompletion() override;
     void dumpAllocation(GraphicsAllocation &gfxAllocation) override;


@@ -241,7 +241,7 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, Resi
         }
     }
-    submitBatchBuffer(
+    submitBatchBufferTbx(
         batchBufferGpuAddress, pBatchBuffer, sizeBatchBuffer,
         this->getMemoryBank(batchBuffer.commandBufferAllocation),
         this->getPPGTTAdditionalBits(batchBuffer.commandBufferAllocation),
@@ -256,7 +256,7 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, Resi
 }
 template <typename GfxFamily>
-void TbxCommandStreamReceiverHw<GfxFamily>::submitBatchBuffer(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead) {
+void TbxCommandStreamReceiverHw<GfxFamily>::submitBatchBufferTbx(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead) {
     if (hardwareContextController) {
         if (batchBufferSize) {
             hardwareContextController->submit(batchBufferGpuAddress, batchBuffer, batchBufferSize, memoryBank, MemoryConstants::pageSize64k, overrideRingHead);
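
The TBX-side rename (submitBatchBuffer to submitBatchBufferTbx) pairs with making the base-class CommandStreamReceiver::submitBatchBuffer MOCKABLE_VIRTUAL above: a derived-class member with the same name would hide the base overload for callers using the derived type. Below is a self-contained sketch of that C++ name-hiding pitfall; the types are stand-ins rather than the real NEO classes, and the stated motivation is an inference from the hunks, not from the commit message.

#include <cstdint>
#include <cstddef>

struct BatchBuffer {};
struct ResidencyContainer {};

struct CommandStreamReceiver {
    virtual ~CommandStreamReceiver() = default;
    // Stand-in for the now MOCKABLE_VIRTUAL base-class method.
    virtual bool submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) { return true; }
};

struct TbxCommandStreamReceiverHw : CommandStreamReceiver {
    // If this member kept the name submitBatchBuffer, it would hide the
    // base-class overload: tbx.submitBatchBuffer(bb, residency) below would
    // not compile without a using-declaration. The Tbx suffix avoids that.
    void submitBatchBufferTbx(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize,
                              uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead) {}
};

int main() {
    TbxCommandStreamReceiverHw tbx;
    BatchBuffer bb;
    ResidencyContainer residency;
    tbx.submitBatchBuffer(bb, residency); // still resolves to the base-class method
    return 0;
}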