Extend batch buffer flattening in AubCSR to BatchedDispatch mode

- batch buffer flattening in batched mode - added MI_USER_INTERRUPT command - added GUC Work Queue Item Change-Id: I35142da34b30d3006bb4ffc1521db7f6ebe68ebc
2026-01-09 22:43:00 +08:00 · 2018-04-04 11:34:46 +02:00
parent 31157573ca
commit a0c044e6d2
41 changed files with 1188 additions and 247 deletions
--- a/runtime/helpers/CMakeLists.txt
+++ b/runtime/helpers/CMakeLists.txt
@@ -44,6 +44,10 @@ set(RUNTIME_SRCS_HELPERS_BASE
  ${CMAKE_CURRENT_SOURCE_DIR}/error_mappers.h
  ${CMAKE_CURRENT_SOURCE_DIR}/file_io.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/file_io.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/flat_batch_buffer_helper.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/flat_batch_buffer_helper.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/flat_batch_buffer_helper_hw.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/flat_batch_buffer_helper_hw.inl
  ${CMAKE_CURRENT_SOURCE_DIR}/flush_stamp.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/flush_stamp.h
  ${CMAKE_CURRENT_SOURCE_DIR}/get_info.h
--- a/runtime/helpers/address_patch.h
+++ b/runtime/helpers/address_patch.h
@@ -32,7 +32,10 @@ enum PatchInfoAllocationType {
    DynamicStateHeap,
    IndirectObjectHeap,
    SurfaceStateHeap,
-    InstructionHeap
+    InstructionHeap,
+    TagAddress,
+    TagValue,
+    GUCStartMessage,
 };

 struct PatchInfoData {
@@ -42,5 +45,50 @@ struct PatchInfoData {
    uint64_t targetAllocation;
    uint64_t targetAllocationOffset;
    PatchInfoAllocationType targetType;
+    uint32_t patchAddressSize;
+
+    PatchInfoData(uint64_t sourceAllocation,
+                  uint64_t sourceAllocationOffset,
+                  PatchInfoAllocationType sourceType,
+                  uint64_t targetAllocation,
+                  uint64_t targetAllocationOffset,
+                  PatchInfoAllocationType targetType,
+                  uint32_t patchAddressSize)
+        : sourceAllocation(sourceAllocation),
+          sourceAllocationOffset(sourceAllocationOffset),
+          sourceType(sourceType),
+          targetAllocation(targetAllocation),
+          targetAllocationOffset(targetAllocationOffset),
+          targetType(targetType),
+          patchAddressSize(patchAddressSize) {
+    }
+
+    // Convenience overload: identical to the seven-argument constructor with
+    // patchAddressSize set to the size of a host pointer.
+    PatchInfoData(uint64_t sourceAllocation,
+                  uint64_t sourceAllocationOffset,
+                  PatchInfoAllocationType sourceType,
+                  uint64_t targetAllocation,
+                  uint64_t targetAllocationOffset,
+                  PatchInfoAllocationType targetType)
+        : PatchInfoData(sourceAllocation, sourceAllocationOffset, sourceType,
+                        targetAllocation, targetAllocationOffset, targetType,
+                        static_cast<uint32_t>(sizeof(void *))) {
+    }
+
+    // Returns true when the target of this patch is neither a Default nor a
+    // GUCStartMessage allocation. Such entries are the ones that
+    // getIndirectPatchCommands() converts into MI_STORE_DATA_IMM commands.
+    // Declared const: it only reads targetType, so it must be callable on
+    // const PatchInfoData objects/references as well.
+    bool requiresIndirectPatching() const {
+        return (targetType != PatchInfoAllocationType::Default && targetType != PatchInfoAllocationType::GUCStartMessage);
+    }
+};
+
+// One contiguous piece of a command stream that takes part in batch buffer
+// flattening. All members default to zero.
+struct CommandChunk {
+    uint64_t baseAddressCpu{0};           // CPU address of the stream that holds the chunk
+    uint64_t baseAddressGpu{0};           // GPU address of the same stream
+    uint64_t startOffset{0};              // first byte of the chunk, relative to the base addresses
+    uint64_t endOffset{0};                // end of the chunk, relative to the base addresses
+    uint64_t batchBufferStartLocation{0}; // GPU address of the batch-buffer-start command found inside the chunk
+    uint64_t batchBufferStartAddress{0};  // start address registered for that command
 };
 } // namespace OCLRT
--- a/runtime/helpers/flat_batch_buffer_helper.cpp
+++ b/runtime/helpers/flat_batch_buffer_helper.cpp
@@ -0,0 +1,78 @@
+/*
+* Copyright (c) 2018, Intel Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+* OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "runtime/helpers/flat_batch_buffer_helper.h"
+
+namespace OCLRT {
+
+// Appends a copy of `data` to the collected patch information.
+// Always reports success.
+bool FlatBatchBufferHelper::setPatchInfoData(const PatchInfoData &data) {
+    patchInfoCollection.emplace_back(data);
+    return true;
+}
+// Removes the first collected entry whose patched location
+// (targetAllocation + targetAllocationOffset) equals targetLocation.
+// Returns true whether or not such an entry existed.
+bool FlatBatchBufferHelper::removePatchInfoData(uint64_t targetLocation) {
+    auto entry = patchInfoCollection.begin();
+    while (entry != patchInfoCollection.end()) {
+        const uint64_t patchedLocation = entry->targetAllocation + entry->targetAllocationOffset;
+        if (patchedLocation == targetLocation) {
+            // Only the first match is dropped; the iterator is not used afterwards.
+            patchInfoCollection.erase(entry);
+            return true;
+        }
+        ++entry;
+    }
+    return true;
+}
+
+// Builds a CommandChunk from raw base addresses and offsets and registers it.
+// The batch-buffer-start fields of the chunk keep their zero defaults.
+bool FlatBatchBufferHelper::registerCommandChunk(uint64_t baseCpu, uint64_t baseGpu, uint64_t startOffset, uint64_t endOffset) {
+    CommandChunk chunk;
+    chunk.baseAddressCpu = baseCpu;
+    chunk.baseAddressGpu = baseGpu;
+    chunk.startOffset = startOffset;
+    chunk.endOffset = endOffset;
+    return registerCommandChunk(chunk);
+}
+
+// Registers the part of batchBuffer's command stream that starts at
+// batchBuffer.startOffset and ends batchBufferStartCommandSize bytes past
+// batchBuffer.chainedBatchBufferStartOffset.
+bool FlatBatchBufferHelper::registerCommandChunk(BatchBuffer &batchBuffer, size_t batchBufferStartCommandSize) {
+    auto &stream = batchBuffer.stream;
+
+    CommandChunk chunk;
+    chunk.baseAddressGpu = stream->getGraphicsAllocation()->getGpuAddress();
+    chunk.baseAddressCpu = reinterpret_cast<uint64_t>(stream->getCpuBase());
+    chunk.startOffset = batchBuffer.startOffset;
+    chunk.endOffset = batchBuffer.chainedBatchBufferStartOffset + batchBufferStartCommandSize;
+    return registerCommandChunk(chunk);
+}
+
+// Stores a copy of commandChunk at the end of the chunk list.
+// Always reports success.
+bool FlatBatchBufferHelper::registerCommandChunk(CommandChunk &commandChunk) {
+    commandChunkList.emplace_back(commandChunk);
+    return true;
+}
+
+// Records that the batch-buffer-start command located at commandAddress
+// uses startAddress. If commandAddress is already registered the existing
+// entry is kept (map insertion does not overwrite). Always reports success.
+bool FlatBatchBufferHelper::registerBatchBufferStartAddress(uint64_t commandAddress, uint64_t startAddress) {
+    batchBufferStartAddressSequence.emplace(commandAddress, startAddress);
+    return true;
+}
+
+// Retargets every KernelArg-sourced entry in `data`: its target allocation
+// becomes gpuAddress and its target offset is shifted by
+// offsetCrossThreadData. Entries with any other source type are untouched.
+void FlatBatchBufferHelper::fixCrossThreadDataInfo(std::vector<PatchInfoData> &data, size_t offsetCrossThreadData, uint64_t gpuAddress) {
+    for (auto &entry : data) {
+        if (entry.sourceType != PatchInfoAllocationType::KernelArg) {
+            continue;
+        }
+        entry.targetAllocation = gpuAddress;
+        entry.targetAllocationOffset += offsetCrossThreadData;
+    }
+}
+};
--- a/runtime/helpers/flat_batch_buffer_helper.h
+++ b/runtime/helpers/flat_batch_buffer_helper.h
@@ -0,0 +1,63 @@
+/*
+* Copyright (c) 2018, Intel Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+* OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "runtime/helpers/address_patch.h"
+#include "runtime/command_stream/submissions_aggregator.h"
+#include <map>
+#include <vector>
+
+namespace OCLRT {
+
+enum class DispatchMode;
+class MemoryManager;
+
+// Collects the information needed to merge ("flatten") chained batch buffers
+// into a single buffer: patch records, command chunks and the addresses
+// used by batch-buffer-start commands. Hardware-specific work is left to
+// the pure virtual hooks, implemented by FlatBatchBufferHelperHw.
+class FlatBatchBufferHelper {
+  public:
+    // explicit: a MemoryManager pointer must never convert silently into a helper.
+    explicit FlatBatchBufferHelper(MemoryManager *memoryManager) : memoryManager(memoryManager) {}
+    virtual ~FlatBatchBufferHelper() = default;
+
+    // Appends a copy of data to the patch collection; returns true.
+    MOCKABLE_VIRTUAL bool setPatchInfoData(const PatchInfoData &data);
+    // Removes the first patch whose targetAllocation + targetAllocationOffset
+    // equals targetLocation; returns true even when nothing matched.
+    MOCKABLE_VIRTUAL bool removePatchInfoData(uint64_t targetLocation);
+    // The three overloads below all end up appending one CommandChunk to the
+    // chunk list and return true.
+    MOCKABLE_VIRTUAL bool registerCommandChunk(uint64_t baseCpu, uint64_t baseGpu, uint64_t startOffset, uint64_t endOffset);
+    MOCKABLE_VIRTUAL bool registerCommandChunk(CommandChunk &commandChunk);
+    MOCKABLE_VIRTUAL bool registerCommandChunk(BatchBuffer &batchBuffer, size_t batchBufferStartCommandSize);
+    // Maps the address of a batch-buffer-start command to its start address;
+    // an already registered commandAddress is left unchanged. Returns true.
+    MOCKABLE_VIRTUAL bool registerBatchBufferStartAddress(uint64_t commandAddress, uint64_t startAddress);
+
+    // Produces the flattened buffer for the given dispatch mode and reports
+    // its size through sizeBatchBuffer.
+    virtual void *flattenBatchBuffer(BatchBuffer &batchBuffer, size_t &sizeBatchBuffer, DispatchMode dispatchMode) = 0;
+    // Produces the commands that apply indirect patches; reports their total
+    // size and the patch records describing them through the out-parameters.
+    virtual char *getIndirectPatchCommands(size_t &indirectPatchCommandsSize, std::vector<PatchInfoData> &indirectPatchInfo) = 0;
+    // Drops the patch records that belong to the pipe controls stored in the
+    // pipeControlLocationSize bytes starting at pipeControlForNooping.
+    virtual void removePipeControlData(size_t pipeControlLocationSize, void *pipeControlForNooping) = 0;
+
+    void setMemoryManager(MemoryManager *memoryManager) { this->memoryManager = memoryManager; }
+    // Retargets KernelArg-sourced patches to gpuAddress and shifts their
+    // target offset by offsetCrossThreadData.
+    static void fixCrossThreadDataInfo(std::vector<PatchInfoData> &data, size_t offsetCrossThreadData, uint64_t gpuAddress);
+
+    // Mutable access to the collected state.
+    std::vector<CommandChunk> &getCommandChunkList() { return commandChunkList; }
+    std::vector<PatchInfoData> &getPatchInfoCollection() { return patchInfoCollection; }
+    std::map<uint64_t, uint64_t> &getBatchBufferStartAddressSequence() { return batchBufferStartAddressSequence; }
+
+  protected:
+    // Not owned by the helper (it is never released here).
+    MemoryManager *memoryManager = nullptr;
+
+    std::vector<PatchInfoData> patchInfoCollection;
+    std::vector<CommandChunk> commandChunkList;
+    // Key: address of a batch-buffer-start command; value: its start address.
+    std::map<uint64_t, uint64_t> batchBufferStartAddressSequence;
+};
+
+} // namespace OCLRT
--- a/runtime/helpers/flat_batch_buffer_helper_hw.h
+++ b/runtime/helpers/flat_batch_buffer_helper_hw.h
@@ -0,0 +1,40 @@
+/*
+* Copyright (c) 2018, Intel Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+* OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "runtime/helpers/flat_batch_buffer_helper.h"
+
+namespace OCLRT {
+
+template <typename GfxFamily> // GfxFamily supplies the hardware command types used here (e.g. MI_STORE_DATA_IMM)
+class FlatBatchBufferHelperHw : public FlatBatchBufferHelper { // family-specific implementation of the FlatBatchBufferHelper hooks
+  public:
+    FlatBatchBufferHelperHw(MemoryManager *memoryManager) : FlatBatchBufferHelper(memoryManager) {} // forwards memoryManager to the base helper
+    void *flattenBatchBuffer(BatchBuffer &batchBuffer, size_t &sizeBatchBuffer, DispatchMode dispatchMode) override; // returns the flattened buffer, or nullptr when none was built; size goes to sizeBatchBuffer
+    char *getIndirectPatchCommands(size_t &indirectPatchCommandsSize, std::vector<PatchInfoData> &indirectPatchInfo) override; // returns a heap array of MI_STORE_DATA_IMM commands, one per indirect patch
+    void removePipeControlData(size_t pipeControlLocationSize, void *pipeControlForNooping) override; // removes the patch records of the pipe controls in the given range
+    static void sdiSetAddress(typename GfxFamily::MI_STORE_DATA_IMM *sdiCommand, uint64_t address); // defined outside this header; presumably programs the store address - confirm per family
+    static void sdiSetStoreQword(typename GfxFamily::MI_STORE_DATA_IMM *sdiCommand, bool setQword); // defined outside this header; presumably selects qword vs. dword store - confirm per family
+};
+
+} // namespace OCLRT
--- a/runtime/helpers/flat_batch_buffer_helper_hw.inl
+++ b/runtime/helpers/flat_batch_buffer_helper_hw.inl
@@ -0,0 +1,196 @@
+/*
+* Copyright (c) 2018, Intel Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+* OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "runtime/helpers/flat_batch_buffer_helper_hw.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/memory_manager/memory_manager.h"
+#include "runtime/helpers/string.h"
+
+namespace OCLRT {
+
+// Builds a single, self-contained copy of the submitted commands.
+//
+// ImmediateDispatch: when batchBuffer has a chained batch buffer, the result
+// is [main batch buffer up to the chained start][indirect patch commands]
+// [chained batch buffer]. Without a chained batch buffer nothing is built.
+//
+// BatchedDispatch: the registered command chunks are ordered by following
+// the registered batch-buffer-start addresses, and the result is
+// [indirect patch commands][chunks in order][MI_USER_INTERRUPT]
+// [MI_BATCH_BUFFER_END]. The chunk list and the start-address map are
+// cleared afterwards.
+//
+// In both modes patchInfoCollection is rebuilt to describe the new buffer.
+//
+// batchBuffer      submission to flatten
+// sizeBatchBuffer  out: size of the returned buffer (written only when a
+//                  buffer is produced)
+// dispatchMode     selects one of the two layouts above
+// Returns memory obtained from memoryManager->alignedMallocWrapper(), or
+// nullptr when no buffer was produced.
+template <typename GfxFamily>
+void *FlatBatchBufferHelperHw<GfxFamily>::flattenBatchBuffer(BatchBuffer &batchBuffer, size_t &sizeBatchBuffer, DispatchMode dispatchMode) {
+    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
+    using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
+    using MI_USER_INTERRUPT = typename GfxFamily::MI_USER_INTERRUPT;
+
+    void *flatBatchBuffer = nullptr;
+
+    size_t indirectPatchCommandsSize = 0u;
+    std::vector<PatchInfoData> indirectPatchInfo;
+    // getIndirectPatchCommands() hands over an array allocated with new[],
+    // so it has to be owned by the array form of unique_ptr (delete[]).
+    std::unique_ptr<char[]> indirectPatchCommands(getIndirectPatchCommands(indirectPatchCommandsSize, indirectPatchInfo));
+
+    if (dispatchMode == DispatchMode::ImmediateDispatch) {
+        if (batchBuffer.chainedBatchBuffer) {
+            batchBuffer.chainedBatchBuffer->setAllocationType(batchBuffer.chainedBatchBuffer->getAllocationType() | GraphicsAllocation::ALLOCATION_TYPE_NON_AUB_WRITABLE);
+            auto sizeMainBatchBuffer = batchBuffer.chainedBatchBufferStartOffset - batchBuffer.startOffset;
+
+            auto flatBatchBufferSize = alignUp(sizeMainBatchBuffer + indirectPatchCommandsSize + batchBuffer.chainedBatchBuffer->getUnderlyingBufferSize(), MemoryConstants::pageSize);
+            flatBatchBuffer = this->memoryManager->alignedMallocWrapper(flatBatchBufferSize, MemoryConstants::pageSize);
+            UNRECOVERABLE_IF(flatBatchBuffer == nullptr);
+            // Copy main batchbuffer
+            memcpy_s(flatBatchBuffer, sizeMainBatchBuffer, ptrOffset(batchBuffer.commandBufferAllocation->getUnderlyingBuffer(), batchBuffer.startOffset), sizeMainBatchBuffer);
+            // Copy indirect patch commands
+            memcpy_s(ptrOffset(flatBatchBuffer, sizeMainBatchBuffer), indirectPatchCommandsSize, indirectPatchCommands.get(), indirectPatchCommandsSize);
+            // Copy chained batchbuffer
+            memcpy_s(ptrOffset(flatBatchBuffer, sizeMainBatchBuffer + indirectPatchCommandsSize), batchBuffer.chainedBatchBuffer->getUnderlyingBufferSize(), batchBuffer.chainedBatchBuffer->getUnderlyingBuffer(), batchBuffer.chainedBatchBuffer->getUnderlyingBufferSize());
+            sizeBatchBuffer = flatBatchBufferSize;
+            patchInfoCollection.insert(std::end(patchInfoCollection), std::begin(indirectPatchInfo), std::end(indirectPatchInfo));
+        }
+    } else if (dispatchMode == DispatchMode::BatchedDispatch) {
+        // Step 1: attach to every chunk the batch-buffer-start command that
+        // lies inside it (the chunk is cut at that command) and find the
+        // chunk that no batch-buffer-start points to - the head of the chain.
+        CommandChunk firstChunk;
+        for (auto &chunk : commandChunkList) {
+            bool found = false;
+            for (auto &bbStart : batchBufferStartAddressSequence) {
+                if ((bbStart.first <= chunk.baseAddressGpu + chunk.endOffset) && (bbStart.first >= chunk.baseAddressGpu + chunk.startOffset)) {
+                    chunk.batchBufferStartLocation = bbStart.first;
+                    chunk.batchBufferStartAddress = bbStart.second;
+                    chunk.endOffset = chunk.batchBufferStartLocation - chunk.baseAddressGpu;
+                }
+                if (bbStart.second == chunk.baseAddressGpu + chunk.startOffset) {
+                    found = true;
+                }
+            }
+            if (!found) {
+                firstChunk = chunk;
+            }
+        }
+
+        // Step 2: walk the chain from the head; each chunk's
+        // batchBufferStartAddress names the beginning of its successor.
+        // NOTE(review): this assumes at least one chunk is registered and
+        // that the chain is acyclic - confirm with callers.
+        std::vector<CommandChunk> orderedChunks;
+        CommandChunk currentChunk = firstChunk;
+        while (true) {
+            bool hasNextChunk = false;
+            for (auto &chunk : commandChunkList) {
+                if (currentChunk.batchBufferStartAddress == chunk.baseAddressGpu + chunk.startOffset) {
+                    hasNextChunk = true;
+                    orderedChunks.push_back(currentChunk);
+                    currentChunk = chunk;
+                    break;
+                }
+            }
+            if (!hasNextChunk) {
+                // Last chunk of the chain: shorten it by the size of one
+                // MI_BATCH_BUFFER_START command.
+                currentChunk.endOffset -= sizeof(MI_BATCH_BUFFER_START);
+                orderedChunks.push_back(currentChunk);
+                break;
+            }
+        }
+
+        // Step 3: keep only the patches that fall inside an ordered chunk and
+        // rebase their offsets onto the position inside the flat buffer
+        // (the indirect patch commands precede the chunks).
+        uint64_t flatBatchBufferSize = 0u;
+        std::vector<PatchInfoData> patchInfoCopy = patchInfoCollection;
+        patchInfoCollection.clear();
+
+        for (auto &chunk : orderedChunks) {
+            const uint64_t chunkBegin = chunk.baseAddressGpu + chunk.startOffset;
+            const uint64_t chunkEnd = chunk.baseAddressGpu + chunk.endOffset;
+            for (auto &patch : patchInfoCopy) {
+                const uint64_t patchLocation = patch.targetAllocation + patch.targetAllocationOffset;
+                if (patchLocation >= chunkBegin && patchLocation <= chunkEnd) {
+                    patch.targetAllocationOffset = patch.targetAllocationOffset - chunk.startOffset + flatBatchBufferSize + indirectPatchCommandsSize;
+                    patchInfoCollection.push_back(patch);
+                }
+            }
+            flatBatchBufferSize += chunk.endOffset - chunk.startOffset;
+        }
+        patchInfoCollection.insert(std::end(patchInfoCollection), std::begin(indirectPatchInfo), std::end(indirectPatchInfo));
+
+        flatBatchBufferSize += sizeof(MI_USER_INTERRUPT);
+        flatBatchBufferSize += sizeof(MI_BATCH_BUFFER_END);
+        flatBatchBufferSize += indirectPatchCommandsSize;
+
+        flatBatchBufferSize = alignUp(flatBatchBufferSize, MemoryConstants::pageSize);
+        flatBatchBufferSize += CSRequirements::csOverfetchSize;
+        flatBatchBuffer = this->memoryManager->alignedMallocWrapper(static_cast<size_t>(flatBatchBufferSize), MemoryConstants::pageSize);
+        // Same policy as the immediate path: never write through a failed allocation.
+        UNRECOVERABLE_IF(flatBatchBuffer == nullptr);
+
+        // Step 4: emit patch commands, then the chunks, then the terminators.
+        char *ptr = reinterpret_cast<char *>(flatBatchBuffer);
+        memcpy_s(ptr, indirectPatchCommandsSize, indirectPatchCommands.get(), indirectPatchCommandsSize);
+        ptr += indirectPatchCommandsSize;
+        for (auto &chunk : orderedChunks) {
+            size_t chunkSize = static_cast<size_t>(chunk.endOffset - chunk.startOffset);
+            memcpy_s(ptr,
+                     chunkSize,
+                     reinterpret_cast<char *>(ptrOffset(chunk.baseAddressCpu, static_cast<size_t>(chunk.startOffset))),
+                     chunkSize);
+            ptr += chunkSize;
+        }
+
+        auto pCmdMui = reinterpret_cast<MI_USER_INTERRUPT *>(ptr);
+        pCmdMui->init();
+        ptr += sizeof(MI_USER_INTERRUPT);
+
+        auto pCmdBBend = reinterpret_cast<MI_BATCH_BUFFER_END *>(ptr);
+        *pCmdBBend = GfxFamily::cmdInitBatchBufferEnd;
+        ptr += sizeof(MI_BATCH_BUFFER_END);
+
+        sizeBatchBuffer = static_cast<size_t>(flatBatchBufferSize);
+        commandChunkList.clear();
+        batchBufferStartAddressSequence.clear();
+    }
+
+    return flatBatchBuffer;
+}
+
+// Converts every collected patch that requires indirect patching into one
+// MI_STORE_DATA_IMM command writing the source address
+// (sourceAllocation + sourceAllocationOffset) to the target address
+// (targetAllocation + targetAllocationOffset).
+//
+// indirectPatchCommandsSize  out: total size in bytes of the generated commands
+// indirectPatchInfo          out: for each generated command two records are
+//                            appended, one for the address field and one for
+//                            the data field of that command
+// Converted patches are removed from patchInfoCollection; all others stay.
+//
+// Returns an array allocated with new char[]; the caller takes ownership and
+// must release it with delete[] (e.g. std::unique_ptr<char[]>).
+template <typename GfxFamily>
+char *FlatBatchBufferHelperHw<GfxFamily>::getIndirectPatchCommands(size_t &indirectPatchCommandsSize, std::vector<PatchInfoData> &indirectPatchInfo) {
+    using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;
+
+    // First pass: size the command buffer.
+    indirectPatchCommandsSize = 0;
+    for (auto &patchInfoData : patchInfoCollection) {
+        if (patchInfoData.requiresIndirectPatching()) {
+            indirectPatchCommandsSize += sizeof(MI_STORE_DATA_IMM);
+        }
+    }
+
+    uint64_t stiCommandOffset = 0;
+    std::vector<PatchInfoData> patchInfoCopy = patchInfoCollection;
+    // Array form of unique_ptr: the storage comes from new[], so it must be
+    // freed with delete[] if anything below throws before release().
+    std::unique_ptr<char[]> buffer(new char[indirectPatchCommandsSize]);
+    LinearStream indirectPatchCommandStream(buffer.get(), indirectPatchCommandsSize);
+    patchInfoCollection.clear();
+
+    // Second pass: emit the commands and split the collection.
+    for (auto &patchInfoData : patchInfoCopy) {
+        if (patchInfoData.requiresIndirectPatching()) {
+            auto storeDataImmediate = indirectPatchCommandStream.getSpaceForCmd<MI_STORE_DATA_IMM>();
+            storeDataImmediate->init();
+            sdiSetAddress(storeDataImmediate, patchInfoData.targetAllocation + patchInfoData.targetAllocationOffset);
+            // Anything that is not a 32-bit patch is stored as a qword.
+            sdiSetStoreQword(storeDataImmediate, patchInfoData.patchAddressSize != sizeof(uint32_t));
+            storeDataImmediate->setDataDword0(static_cast<uint32_t>((patchInfoData.sourceAllocation + patchInfoData.sourceAllocationOffset) & 0x0000FFFFFFFFULL));
+            storeDataImmediate->setDataDword1(static_cast<uint32_t>((patchInfoData.sourceAllocation + patchInfoData.sourceAllocationOffset) >> 32));
+
+            // The last two qwords of the command hold the address and the
+            // value; record both so they can be patched in the flat buffer.
+            PatchInfoData patchInfoForAddress(patchInfoData.targetAllocation, patchInfoData.targetAllocationOffset, patchInfoData.targetType, 0u, stiCommandOffset + sizeof(MI_STORE_DATA_IMM) - 2 * sizeof(uint64_t), PatchInfoAllocationType::Default);
+            PatchInfoData patchInfoForValue(patchInfoData.sourceAllocation, patchInfoData.sourceAllocationOffset, patchInfoData.sourceType, 0u, stiCommandOffset + sizeof(MI_STORE_DATA_IMM) - sizeof(uint64_t), PatchInfoAllocationType::Default);
+            indirectPatchInfo.push_back(patchInfoForAddress);
+            indirectPatchInfo.push_back(patchInfoForValue);
+            stiCommandOffset += sizeof(MI_STORE_DATA_IMM);
+        } else {
+            patchInfoCollection.push_back(patchInfoData);
+        }
+    }
+    return buffer.release();
+}
+// For each PIPE_CONTROL that fits in the pipeControlLocationSize bytes
+// starting at pipeControlForNooping, removes the patch records registered
+// for the last two qwords of that command.
+template <typename GfxFamily>
+void FlatBatchBufferHelperHw<GfxFamily>::removePipeControlData(size_t pipeControlLocationSize, void *pipeControlForNooping) {
+    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
+
+    const uint64_t firstPipeControl = reinterpret_cast<uint64_t>(pipeControlForNooping);
+    const size_t pipeControlCount = pipeControlLocationSize / sizeof(PIPE_CONTROL);
+
+    for (size_t index = 1; index <= pipeControlCount; ++index) {
+        // Address one past the end of the current pipe control.
+        const uint64_t pipeControlEnd = firstPipeControl + index * sizeof(PIPE_CONTROL);
+        removePatchInfoData(pipeControlEnd - 2 * sizeof(uint64_t));
+        removePatchInfoData(pipeControlEnd - sizeof(uint64_t));
+    }
+}
+
+}; // namespace OCLRT
--- a/runtime/helpers/kernel_commands.inl
+++ b/runtime/helpers/kernel_commands.inl
@@ -155,6 +155,7 @@ size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
    // Program the kernel start pointer
    pInterfaceDescriptor->setKernelStartPointerHigh(kernelStartOffset >> 32);
    pInterfaceDescriptor->setKernelStartPointer((uint32_t)kernelStartOffset);
+
    // # of threads in thread group should be based on LWS.
    pInterfaceDescriptor->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);

@@ -234,10 +235,7 @@ size_t KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
    memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(), sizeCrossThreadData);

    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
-        for (auto &patchInfoData : kernel.getPatchInfoDataList()) {
-            patchInfoData.targetAllocation = indirectHeap.getGpuBase();
-            patchInfoData.targetAllocationOffset += offsetCrossThreadData;
-        }
+        FlatBatchBufferHelper::fixCrossThreadDataInfo(kernel.getPatchInfoDataList(), offsetCrossThreadData, indirectHeap.getGraphicsAllocation()->getGpuAddress());
    }

    return offsetCrossThreadData + static_cast<size_t>(indirectHeap.getHeapGpuStartOffset());
@@ -399,6 +397,11 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
        !!patchInfo.executionEnvironment->HasBarriers,
        preemptionMode);

+    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
+        PatchInfoData patchInfoData(kernelStartOffset, 0, PatchInfoAllocationType::InstructionHeap, dsh.getGraphicsAllocation()->getGpuAddress(), offsetInterfaceDescriptor, PatchInfoAllocationType::DynamicStateHeap);
+        kernel.getPatchInfoDataList().push_back(patchInfoData);
+    }
+
    // Program media state flush to set interface descriptor offset
    KernelCommandsHelper<GfxFamily>::sendMediaStateFlush(
        commandStream,