feature: implement ISA allocation pooling in OpenCL

Related-To: NEO-12287 Signed-off-by: Fabian Zwoliński <fabian.zwolinski@intel.com>
2025-12-20 08:53:55 +08:00 · 2025-11-17 15:26:03 +00:00
parent a627594e83
commit 1b9b78ac16
25 changed files with 824 additions and 87 deletions
--- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
+++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
@@ -124,8 +124,8 @@ inline void HardwareInterface<GfxFamily>::programWalker(
    auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());

    if constexpr (heaplessModeEnabled == false) {
-        if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {
-            EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.kernelHeapSize, 0, rootDeviceEnvironment);
+        if (auto kernelAllocation = kernelInfo.getIsaGraphicsAllocation()) {
+            EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.kernelHeapSize, kernelInfo.getIsaOffsetInParentAllocation(), rootDeviceEnvironment);
        }
    }

--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -515,7 +515,7 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
        srcSize = getKernelHeapSize();
        break;
    case CL_KERNEL_BINARY_GPU_ADDRESS_INTEL:
-        nonCannonizedGpuAddress = gmmHelper->decanonize(kernelInfo.kernelAllocation->getGpuAddress());
+        nonCannonizedGpuAddress = gmmHelper->decanonize(kernelInfo.getIsaGraphicsAllocation()->getGpuAddress() + kernelInfo.getIsaOffsetInParentAllocation());
        pSrc = &nonCannonizedGpuAddress;
        srcSize = sizeof(nonCannonizedGpuAddress);
        break;
@@ -788,21 +788,23 @@ void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize)
    pKernelInfo->isKernelHeapSubstituted = true;
    auto memoryManager = executionEnvironment.memoryManager.get();

-    auto currentAllocationSize = pKernelInfo->kernelAllocation->getUnderlyingBufferSize();
+    auto currentAllocationSize = pKernelInfo->getIsaSize();
    bool status = false;
    auto &rootDeviceEnvironment = clDevice.getRootDeviceEnvironment();
    auto &helper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
    size_t isaPadding = helper.getPaddingForISAAllocation();

+    DEBUG_BREAK_IF(nullptr != pKernelInfo->getIsaParentAllocation());
+
    if (currentAllocationSize >= newKernelHeapSize + isaPadding) {
        auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
-        auto useBlitter = productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *pKernelInfo->getGraphicsAllocation());
+        auto useBlitter = productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *pKernelInfo->getIsaGraphicsAllocation());
        status = MemoryTransferHelper::transferMemoryToAllocation(useBlitter,
-                                                                  clDevice.getDevice(), pKernelInfo->getGraphicsAllocation(), 0, newKernelHeap,
+                                                                  clDevice.getDevice(), pKernelInfo->getIsaGraphicsAllocation(), 0, newKernelHeap,
                                                                  static_cast<size_t>(newKernelHeapSize));
    } else {
-        memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->kernelAllocation);
-        pKernelInfo->kernelAllocation = nullptr;
+        memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->getIsaGraphicsAllocation());
+        pKernelInfo->setIsaPerKernelAllocation(nullptr);
        status = pKernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn);
    }
    UNRECOVERABLE_IF(!status);
@@ -1315,7 +1317,7 @@ void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
    }
    makeArgsResident(commandStreamReceiver);

-    auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
+    auto kernelIsaAllocation = this->kernelInfo.getIsaGraphicsAllocation();
    if (kernelIsaAllocation) {
        commandStreamReceiver.makeResident(*kernelIsaAllocation);
    }
@@ -1381,8 +1383,8 @@ void Kernel::getResidency(std::vector<Surface *> &dst) {
        }
    }

-    auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
-    if (kernelIsaAllocation) {
+    if (auto kernelIsaAllocation = this->kernelInfo.getIsaGraphicsAllocation();
+        kernelIsaAllocation != nullptr) {
        GeneralSurface *surface = new GeneralSurface(kernelIsaAllocation);
        dst.push_back(surface);
    }
@@ -1922,12 +1924,13 @@ bool Kernel::hasIndirectStatelessAccessToHostMemory() const {
 }

 uint64_t Kernel::getKernelStartAddress(const bool localIdsGenerationByRuntime, const bool kernelUsesLocalIds, const bool isCssUsed, const bool returnFullAddress) const {
-
    uint64_t kernelStartOffset = 0;

-    if (kernelInfo.getGraphicsAllocation()) {
-        kernelStartOffset = returnFullAddress ? kernelInfo.getGraphicsAllocation()->getGpuAddress()
-                                              : kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
+    if (kernelInfo.getIsaGraphicsAllocation()) {
+        auto offsetInParentAllocation = kernelInfo.getIsaOffsetInParentAllocation();
+        kernelStartOffset = returnFullAddress ? kernelInfo.getIsaGraphicsAllocation()->getGpuAddress() + offsetInParentAllocation
+                                              : kernelInfo.getIsaGraphicsAllocation()->getGpuAddressToPatch() + offsetInParentAllocation;
+
        if (localIdsGenerationByRuntime == false && kernelUsesLocalIds == true) {
            kernelStartOffset += kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad;
        }
--- a/opencl/source/program/process_device_binary.cpp
+++ b/opencl/source/program/process_device_binary.cpp
@@ -16,6 +16,7 @@
 #include "shared/source/helpers/compiler_product_helper.h"
 #include "shared/source/helpers/debug_helpers.h"
 #include "shared/source/helpers/file_io.h"
+#include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/hw_info.h"
 #include "shared/source/helpers/ptr_math.h"
 #include "shared/source/helpers/string.h"
@@ -109,14 +110,16 @@ cl_int Program::linkBinary(Device *pDevice, const void *constantsInitData, size_
        exportedFunctionsKernelId = static_cast<size_t>(linkerInput->getExportedFunctionsSegmentId());
        // Exported functions reside in instruction heap of one of kernels
        auto exportedFunctionHeapId = linkerInput->getExportedFunctionsSegmentId();
-        buildInfos[rootDeviceIndex].exportedFunctionsSurface = kernelInfoArray[exportedFunctionHeapId]->getGraphicsAllocation();
+        buildInfos[rootDeviceIndex].exportedFunctionsSurface = kernelInfoArray[exportedFunctionHeapId]->getIsaGraphicsAllocation();
+        auto offsetInParentAllocation = kernelInfoArray[exportedFunctionHeapId]->getIsaOffsetInParentAllocation();
+
        auto &compilerProductHelper = pDevice->getCompilerProductHelper();
        if (compilerProductHelper.isHeaplessModeEnabled(pDevice->getHardwareInfo())) {
-            exportedFunctions.gpuAddress = static_cast<uintptr_t>(buildInfos[rootDeviceIndex].exportedFunctionsSurface->getGpuAddress());
+            exportedFunctions.gpuAddress = static_cast<uintptr_t>(buildInfos[rootDeviceIndex].exportedFunctionsSurface->getGpuAddress() + offsetInParentAllocation);
        } else {
-            exportedFunctions.gpuAddress = static_cast<uintptr_t>(buildInfos[rootDeviceIndex].exportedFunctionsSurface->getGpuAddressToPatch());
+            exportedFunctions.gpuAddress = static_cast<uintptr_t>(buildInfos[rootDeviceIndex].exportedFunctionsSurface->getGpuAddressToPatch() + offsetInParentAllocation);
        }
-        exportedFunctions.segmentSize = buildInfos[rootDeviceIndex].exportedFunctionsSurface->getUnderlyingBufferSize();
+        exportedFunctions.segmentSize = kernelInfoArray[exportedFunctionHeapId]->getIsaSize();
    }
    Linker::PatchableSegments isaSegmentsForPatching;
    std::vector<std::vector<char>> patchedIsaTempStorage;
@@ -128,8 +131,8 @@ cl_int Program::linkBinary(Device *pDevice, const void *constantsInitData, size_
            auto &kernHeapInfo = kernelInfo->heapInfo;
            const char *originalIsa = reinterpret_cast<const char *>(kernHeapInfo.pKernelHeap);
            patchedIsaTempStorage.push_back(std::vector<char>(originalIsa, originalIsa + kernHeapInfo.kernelHeapSize));
-            DEBUG_BREAK_IF(nullptr == kernelInfo->getGraphicsAllocation());
-            isaSegmentsForPatching.push_back(Linker::PatchableSegment{patchedIsaTempStorage.rbegin()->data(), static_cast<uintptr_t>(kernelInfo->getGraphicsAllocation()->getGpuAddressToPatch()), kernHeapInfo.kernelHeapSize});
+            DEBUG_BREAK_IF(nullptr == kernelInfo->getIsaGraphicsAllocation());
+            isaSegmentsForPatching.push_back(Linker::PatchableSegment{patchedIsaTempStorage.rbegin()->data(), static_cast<uintptr_t>(kernelInfo->getIsaGraphicsAllocation()->getGpuAddressToPatch() + kernelInfo->getIsaOffsetInParentAllocation()), kernHeapInfo.kernelHeapSize});
            kernelDescriptors.push_back(&kernelInfo->kernelDescriptor);
        }
    }
@@ -151,16 +154,8 @@ cl_int Program::linkBinary(Device *pDevice, const void *constantsInitData, size_
        updateBuildLog(pDevice->getRootDeviceIndex(), error.c_str(), error.size());
        return CL_INVALID_BINARY;
    } else if (linkerInput->getTraits().requiresPatchingOfInstructionSegments) {
-        for (auto kernelId = 0u; kernelId < kernelInfoArray.size(); kernelId++) {
-            const auto &kernelInfo = kernelInfoArray[kernelId];
-            auto &kernHeapInfo = kernelInfo->heapInfo;
-            auto segmentId = &kernelInfo - &kernelInfoArray[0];
-            auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
-            const auto &productHelper = pDevice->getProductHelper();
-            MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *kernelInfo->getGraphicsAllocation()),
-                                                             *pDevice, kernelInfo->getGraphicsAllocation(), 0, isaSegmentsForPatching[segmentId].hostPointer,
-                                                             static_cast<size_t>(kernHeapInfo.kernelHeapSize));
-        }
+        [[maybe_unused]] auto success = transferIsaSegmentsToAllocation(pDevice, kernelInfoArray, &isaSegmentsForPatching, rootDeviceIndex);
+        DEBUG_BREAK_IF(!success);
    }
    DBG_LOG(PrintRelocations, NEO::constructRelocationsDebugMessage(this->getSymbols(pDevice->getRootDeviceIndex())));
    return CL_SUCCESS;
@@ -328,19 +323,11 @@ cl_int Program::processProgramInfo(ProgramInfo &src, const ClDevice &clDevice) {
    }
    buildInfos[rootDeviceIndex].kernelMiscInfoPos = src.kernelMiscInfoPos;

-    for (auto &kernelInfo : kernelInfoArray) {
-        cl_int retVal = CL_SUCCESS;
-        if (kernelInfo->heapInfo.kernelHeapSize) {
-            retVal = kernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn) ? CL_SUCCESS : CL_OUT_OF_HOST_MEMORY;
-        }
-
-        if (retVal != CL_SUCCESS) {
+    if (auto retVal = setIsaGraphicsAllocations(clDevice.getDevice(), kernelInfoArray, deviceInfoConstants, rootDeviceIndex);
+        retVal != CL_SUCCESS) {
        return retVal;
    }

-        kernelInfo->apply(deviceInfoConstants);
-    }
-
    indirectDetectionVersion = src.indirectDetectionVersion;
    indirectAccessBufferMajorVersion = src.indirectAccessBufferMajorVersion;

@@ -383,8 +370,16 @@ Zebin::Debug::Segments Program::getZebinSegments(uint32_t rootDeviceIndex) {
                                       buildInfos[rootDeviceIndex].constStringSectionData.size};
    std::vector<NEO::Zebin::Debug::Segments::KernelNameIsaTupleT> kernels;
    for (const auto &kernelInfo : buildInfos[rootDeviceIndex].kernelInfoArray) {
+        NEO::Zebin::Debug::Segments::Segment segment;
+
+        if (kernelInfo->getIsaParentAllocation()) {
+            segment.address = static_cast<uintptr_t>(kernelInfo->getIsaGraphicsAllocation()->getGpuAddress() + kernelInfo->getIsaOffsetInParentAllocation());
+            segment.size = kernelInfo->getIsaSubAllocationSize();
+        } else {
+            segment.address = static_cast<uintptr_t>(kernelInfo->getIsaGraphicsAllocation()->getGpuAddress());
+            segment.size = kernelInfo->getIsaGraphicsAllocation()->getUnderlyingBufferSize();
+        }

-        NEO::Zebin::Debug::Segments::Segment segment = {static_cast<uintptr_t>(kernelInfo->getGraphicsAllocation()->getGpuAddress()), kernelInfo->getGraphicsAllocation()->getUnderlyingBufferSize()};
        kernels.push_back({kernelInfo->kernelDescriptor.kernelMetadata.kernelName, segment});
    }
    return Zebin::Debug::Segments(getGlobalSurface(rootDeviceIndex), getConstantSurface(rootDeviceIndex), strings, kernels);
--- a/opencl/source/program/program.cpp
+++ b/opencl/source/program/program.cpp
@@ -19,6 +19,7 @@
 #include "shared/source/execution_environment/execution_environment.h"
 #include "shared/source/execution_environment/root_device_environment.h"
 #include "shared/source/helpers/addressing_mode_helper.h"
+#include "shared/source/helpers/aligned_memory.h"
 #include "shared/source/helpers/api_specific_config.h"
 #include "shared/source/helpers/compiler_options_parser.h"
 #include "shared/source/helpers/compiler_product_helper.h"
@@ -29,10 +30,13 @@
 #include "shared/source/os_interface/os_context.h"
 #include "shared/source/program/kernel_info.h"
 #include "shared/source/program/metadata_generation.h"
+#include "shared/source/utilities/buffer_pool_allocator.inl"

 #include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/context/context.h"

+#include <algorithm>
+
 namespace NEO {

 Program::Program(Context *context, bool isBuiltIn, const ClDeviceVector &clDevicesIn) : executionEnvironment(*clDevicesIn[0]->getExecutionEnvironment()),
@@ -301,6 +305,172 @@ cl_int Program::createProgramFromBinary(
    return retVal;
 }

+bool Program::isIsaPoolingEnabled(Device &neoDevice) {
+    if (auto dbgFlag = debugManager.flags.EnableIsaAllocationPool.get(); dbgFlag != -1) {
+        return static_cast<bool>(dbgFlag);
+    }
+
+    return neoDevice.getProductHelper().is2MBLocalMemAlignmentEnabled() &&
+           nullptr == neoDevice.getL0Debugger() &&
+           false == gtpinIsGTPinInitialized() &&
+           false == neoDevice.getMemoryManager()->isKernelBinaryReuseEnabled();
+}
+
+cl_int Program::setIsaGraphicsAllocations(Device &neoDevice, std::vector<KernelInfo *> &kernelInfoArray, DeviceInfoKernelPayloadConstants &deviceInfoConstants, uint32_t rootDeviceIndex) {
+    for (auto &kernelInfo : kernelInfoArray) {
+        kernelInfo->apply(deviceInfoConstants);
+    }
+
+    std::vector<KernelInfo *> validKernelInfos;
+    validKernelInfos.reserve(kernelInfoArray.size());
+    std::ranges::copy_if(kernelInfoArray,
+                         std::back_inserter(validKernelInfos),
+                         [](const KernelInfo *info) { return info->heapInfo.kernelHeapSize != 0; });
+
+    DEBUG_BREAK_IF(validKernelInfos.size() != kernelInfoArray.size());
+
+    if (validKernelInfos.empty()) {
+        return CL_SUCCESS;
+    }
+
+    if (isIsaPoolingEnabled(neoDevice)) {
+        const size_t kernelsCount = validKernelInfos.size();
+        std::vector<std::pair<size_t, size_t>> kernelsChunks(kernelsCount);
+        size_t kernelsIsaTotalSize = 0u;
+
+        for (size_t i = 0; i < kernelsCount; i++) {
+            auto chunkOffset = kernelsIsaTotalSize;
+            auto chunkSize = computeKernelIsaAllocationAlignedSizeWithPadding(neoDevice, validKernelInfos[i]->heapInfo.kernelHeapSize, ((i + 1) == kernelsCount));
+            kernelsIsaTotalSize += chunkSize;
+            kernelsChunks[i] = {chunkOffset, chunkSize};
+        }
+
+        auto &isaAllocator = neoDevice.getIsaPoolAllocator();
+        auto crossProgramAllocation = isaAllocator.requestGraphicsAllocationForIsa(isBuiltIn, kernelsIsaTotalSize);
+        if (crossProgramAllocation == nullptr) {
+            return CL_OUT_OF_HOST_MEMORY;
+        }
+        auto &sharedIsaAllocation = buildInfos[rootDeviceIndex].sharedIsaAllocation;
+        sharedIsaAllocation.reset(crossProgramAllocation);
+
+        for (size_t i = 0; i < kernelsCount; i++) {
+            auto [isaOffset, isaSize] = kernelsChunks[i];
+            validKernelInfos[i]->setIsaParentAllocation(sharedIsaAllocation->getGraphicsAllocation());
+            validKernelInfos[i]->setIsaSubAllocationOffset(sharedIsaAllocation->getOffset() + isaOffset);
+            validKernelInfos[i]->setIsaSubAllocationSize(isaSize);
+        }
+
+        if (!transferIsaSegmentsToAllocation(&neoDevice, validKernelInfos, nullptr, rootDeviceIndex)) {
+            return CL_OUT_OF_HOST_MEMORY;
+        }
+    } else {
+        for (auto &kernelInfo : validKernelInfos) {
+            if (!kernelInfo->createKernelAllocation(neoDevice, isBuiltIn)) {
+                return CL_OUT_OF_HOST_MEMORY;
+            }
+        }
+    }
+
+    return CL_SUCCESS;
+}
+
+bool Program::transferIsaSegmentsToAllocation(Device *pDevice, std::vector<KernelInfo *> &kernelInfoArray, const Linker::PatchableSegments *isaSegmentsForPatching, uint32_t rootDeviceIndex) {
+    const auto &productHelper = pDevice->getProductHelper();
+    auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
+
+    auto &sharedIsaAllocation = buildInfos[rootDeviceIndex].sharedIsaAllocation;
+    if (sharedIsaAllocation) {
+        const auto isaBufferSize = sharedIsaAllocation->getSize();
+        DEBUG_BREAK_IF(isaBufferSize == 0);
+
+        std::vector<std::byte> isaBuffer(isaBufferSize);
+        std::memset(isaBuffer.data(), 0x0, isaBufferSize);
+        auto programOffset = sharedIsaAllocation->getOffset();
+
+        for (auto &kernelInfo : kernelInfoArray) {
+            kernelInfo->getIsaGraphicsAllocation()->setAubWritable(true, std::numeric_limits<uint32_t>::max());
+            kernelInfo->getIsaGraphicsAllocation()->setTbxWritable(true, std::numeric_limits<uint32_t>::max());
+
+            auto [kernelHeapPtr, kernelHeapSize] = getKernelHeapPointerAndSize(kernelInfo, kernelInfoArray, isaSegmentsForPatching);
+            auto isaOffset = kernelInfo->getIsaOffsetInParentAllocation() - programOffset;
+            memcpy_s(isaBuffer.data() + isaOffset,
+                     isaBufferSize - isaOffset,
+                     kernelHeapPtr,
+                     kernelHeapSize);
+        }
+
+        auto programAllocation = sharedIsaAllocation->getGraphicsAllocation();
+        auto lock = sharedIsaAllocation->obtainSharedAllocationLock();
+
+        auto success = NEO::MemoryTransferHelper::transferMemoryToAllocation(
+            productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *programAllocation),
+            *pDevice,
+            programAllocation,
+            programOffset,
+            isaBuffer.data(),
+            isaBuffer.size());
+
+        if (pDevice->getDefaultEngine().commandStreamReceiver->getType() != NEO::CommandStreamReceiverType::hardware) {
+            pDevice->getDefaultEngine().commandStreamReceiver->writeMemory(*programAllocation);
+        }
+
+        return success;
+    } else {
+        if (pDevice->getMemoryManager()->isKernelBinaryReuseEnabled()) {
+            return true;
+        }
+
+        for (auto &kernelInfo : kernelInfoArray) {
+            auto [kernelHeapPtr, kernelHeapSize] = getKernelHeapPointerAndSize(kernelInfo, kernelInfoArray, isaSegmentsForPatching);
+
+            if (nullptr == kernelInfo->getIsaGraphicsAllocation() || 0 == kernelHeapSize) {
+                continue;
+            }
+
+            DEBUG_BREAK_IF(0 == kernelInfo->heapInfo.kernelHeapSize);
+            DEBUG_BREAK_IF(kernelInfo->getIsaGraphicsAllocation() == kernelInfo->getIsaParentAllocation());
+
+            bool success = MemoryTransferHelper::transferMemoryToAllocation(
+                productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *kernelInfo->getIsaGraphicsAllocation()),
+                *pDevice,
+                kernelInfo->getIsaGraphicsAllocation(),
+                0,
+                kernelHeapPtr,
+                kernelHeapSize);
+
+            if (!success) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+std::pair<const void *, size_t> Program::getKernelHeapPointerAndSize(KernelInfo *const &kernelInfo, std::vector<KernelInfo *> &kernelInfoArray, const Linker::PatchableSegments *isaSegmentsForPatching) {
+    if (isaSegmentsForPatching) {
+        auto &segments = *isaSegmentsForPatching;
+        auto segmentId = &kernelInfo - &kernelInfoArray[0];
+        return {segments[segmentId].hostPointer, segments[segmentId].segmentSize};
+    } else {
+        return {kernelInfo->heapInfo.pKernelHeap, static_cast<size_t>(kernelInfo->heapInfo.kernelHeapSize)};
+    }
+}
+
+size_t Program::computeKernelIsaAllocationAlignedSizeWithPadding(const Device &neoDevice, size_t isaSize, bool lastKernel) {
+    auto isaPadding = lastKernel ? neoDevice.getGfxCoreHelper().getPaddingForISAAllocation() : 0u;
+    auto kernelStartPointerAlignment = neoDevice.getGfxCoreHelper().getKernelIsaPointerAlignment();
+    auto isaAllocationSize = alignUp(isaPadding + isaSize, kernelStartPointerAlignment);
+    return isaAllocationSize;
+}
+
+GraphicsAllocation *Program::getKernelsIsaParentAllocation(uint32_t rootDeviceIndex) const {
+    if (!buildInfos[rootDeviceIndex].sharedIsaAllocation) {
+        return nullptr;
+    }
+    return buildInfos[rootDeviceIndex].sharedIsaAllocation->getGraphicsAllocation();
+}
+
 cl_int Program::setProgramSpecializationConstant(cl_uint specId, size_t specSize, const void *specValue) {
    if (!isSpirV) {
        return CL_INVALID_PROGRAM;
@@ -381,11 +551,11 @@ const char *Program::getBuildLog(uint32_t rootDeviceIndex) const {
 void Program::cleanCurrentKernelInfo(uint32_t rootDeviceIndex) {
    auto &buildInfo = buildInfos[rootDeviceIndex];
    for (auto &kernelInfo : buildInfo.kernelInfoArray) {
-        if (kernelInfo->kernelAllocation) {
+        if (kernelInfo->getIsaGraphicsAllocation()) {
            // register cache flush in all csrs where kernel allocation was used
            for (auto &engine : this->executionEnvironment.memoryManager->getRegisteredEngines(rootDeviceIndex)) {
                auto contextId = engine.osContext->getContextId();
-                if (kernelInfo->kernelAllocation->isUsedByOsContext(contextId)) {
+                if (kernelInfo->getIsaGraphicsAllocation()->isUsedByOsContext(contextId)) {
                    engine.commandStreamReceiver->registerInstructionCacheFlush();
                }
            }
@@ -403,13 +573,26 @@ void Program::cleanCurrentKernelInfo(uint32_t rootDeviceIndex) {
                    }
                }
            } else {
-                this->executionEnvironment.memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(kernelInfo->kernelAllocation);
+                if (!buildInfo.sharedIsaAllocation) {
+                    this->executionEnvironment.memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(kernelInfo->getIsaGraphicsAllocation());
+                }
            }
        }
        delete kernelInfo;
    }
    buildInfo.kernelInfoArray.clear();
    metadataGeneration.reset(new MetadataGeneration());
+
+    if (buildInfo.sharedIsaAllocation) {
+        for (auto &device : clDevices) {
+            auto &isaAllocator = device->getDevice().getIsaPoolAllocator();
+            if (isaAllocator.isPoolBuffer(buildInfo.sharedIsaAllocation->getGraphicsAllocation())) {
+                isaAllocator.freeSharedIsaAllocation(buildInfo.sharedIsaAllocation.release());
+                break;
+            }
+        }
+        DEBUG_BREAK_IF(buildInfo.sharedIsaAllocation);
+    }
 }

 void Program::updateNonUniformFlag() {
@@ -666,9 +849,15 @@ StackVec<NEO::GraphicsAllocation *, 32> Program::getModuleAllocations(uint32_t r
    StackVec<NEO::GraphicsAllocation *, 32> allocs;
    auto &kernelInfoArray = buildInfos[rootIndex].kernelInfoArray;

+    if (auto isaParentAllocation = this->getKernelsIsaParentAllocation(rootIndex);
+        isaParentAllocation != nullptr) {
+        allocs.push_back(isaParentAllocation);
+    } else {
        for (const auto &kernelInfo : kernelInfoArray) {
-        allocs.push_back(kernelInfo->getGraphicsAllocation());
+            allocs.push_back(kernelInfo->getIsaGraphicsAllocation());
        }
+    }
+
    GraphicsAllocation *globalsForPatching = getGlobalSurfaceGA(rootIndex);
    GraphicsAllocation *constantsForPatching = getConstantSurfaceGA(rootIndex);

--- a/opencl/source/program/program.h
+++ b/opencl/source/program/program.h
@@ -37,7 +37,9 @@ struct MetadataGeneration;
 struct KernelInfo;
 enum class DecodeError : uint8_t;
 struct ExternalFunctionInfo;
-class SharedPoolAlloction;
+class SharedPoolAllocation;
+struct DeviceInfoKernelPayloadConstants;
+class ProductHelper;

 template <>
 struct OpenCLObjectMapper<_cl_program> {
@@ -196,6 +198,31 @@ class Program : public BaseObject<_cl_program> {
    NEO::GraphicsAllocation *getGlobalSurfaceGA(uint32_t rootDeviceIndex) const;
    NEO::GraphicsAllocation *getExportedFunctionsSurface(uint32_t rootDeviceIndex) const;

+    MOCKABLE_VIRTUAL bool isIsaPoolingEnabled(Device &neoDevice);
+    cl_int setIsaGraphicsAllocations(
+        Device &neoDevice,
+        std::vector<KernelInfo *> &kernelInfoArray,
+        DeviceInfoKernelPayloadConstants &deviceInfoConstants,
+        uint32_t rootDeviceIndex);
+
+    MOCKABLE_VIRTUAL bool transferIsaSegmentsToAllocation(
+        Device *pDevice,
+        std::vector<KernelInfo *> &kernelInfoArray,
+        const Linker::PatchableSegments *isaSegmentsForPatching,
+        uint32_t rootDeviceIndex);
+
+    std::pair<const void *, size_t> getKernelHeapPointerAndSize(
+        KernelInfo *const &kernelInfo,
+        std::vector<KernelInfo *> &kernelInfoArray,
+        const Linker::PatchableSegments *isaSegmentsForPatching);
+
+    size_t computeKernelIsaAllocationAlignedSizeWithPadding(
+        const Device &neoDevice,
+        size_t isaSize,
+        bool lastKernel);
+
+    GraphicsAllocation *getKernelsIsaParentAllocation(uint32_t rootDeviceIndex) const;
+
    void cleanCurrentKernelInfo(uint32_t rootDeviceIndex);

    const std::string &getOptions() const { return options; }
@@ -331,6 +358,7 @@ class Program : public BaseObject<_cl_program> {
        std::vector<KernelInfo *> kernelInfoArray;
        std::unique_ptr<NEO::SharedPoolAllocation> constantSurface;
        std::unique_ptr<NEO::SharedPoolAllocation> globalSurface;
+        std::unique_ptr<NEO::SharedPoolAllocation> sharedIsaAllocation;
        GraphicsAllocation *exportedFunctionsSurface = nullptr;
        size_t globalVarTotalSize = 0U;
        std::unique_ptr<LinkerInput> linkerInput;
--- a/opencl/test/unit_test/aub_tests/command_queue/aub_inline_data_local_id_tests_xehp_and_later.cpp
+++ b/opencl/test/unit_test/aub_tests/command_queue/aub_inline_data_local_id_tests_xehp_and_later.cpp
@@ -379,8 +379,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubHwLocalIdsTest, WhenEnqueueDimension
    EXPECT_EQ(1u, walker->getGenerateLocalId());

    constexpr bool isHeapless = FamilyType::template isHeaplessMode<WalkerType>();
-    auto kernelAllocationGpuAddr = isHeapless ? kernel->getKernelInfo().kernelAllocation->getGpuAddress()
-                                              : kernel->getKernelInfo().kernelAllocation->getGpuAddressToPatch();
+    auto offsetInParentAllocation = kernel->getKernelInfo().getIsaOffsetInParentAllocation();
+    auto kernelAllocationGpuAddr = isHeapless ? kernel->getKernelInfo().getIsaGraphicsAllocation()->getGpuAddress() + offsetInParentAllocation
+                                              : kernel->getKernelInfo().getIsaGraphicsAllocation()->getGpuAddressToPatch() + offsetInParentAllocation;
+
    auto skipOffset = kernel->getKernelInfo().kernelDescriptor.entryPoints.skipPerThreadDataLoad;
    uint64_t kernelStartPointer = kernelAllocationGpuAddr + skipOffset;

--- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp
@@ -5,6 +5,7 @@
 *
 */

+#include "shared/source/utilities/buffer_pool_allocator.inl"
 #include "shared/test/common/helpers/engine_descriptor_helper.h"
 #include "shared/test/common/helpers/unit_test_helper.h"
 #include "shared/test/common/mocks/mock_allocation_properties.h"
@@ -1236,6 +1237,12 @@ HWTEST_F(CommandQueueHwTest, givenKernelSplitEnqueueReadBufferWhenBlockedThenEnq
        if (it->first == bufferAllocation) {
            expected = 3u;
        }
+
+        // Shared parent isa allocation for all kernels
+        if (pDevice->getIsaPoolAllocator().isPoolBuffer(it->first)) {
+            expected = 2u;
+        }
+
        EXPECT_EQ(expected, it->second);
    }

--- a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
@@ -941,6 +941,10 @@ TEST_F(EnqueueSvmTest, GivenSvmAllocationWhenEnqueingKernelThenSuccessIsReturned

 TEST_F(EnqueueSvmTest, givenEnqueueTaskBlockedOnUserEventWhenItIsEnqueuedThenSurfacesAreMadeResident) {
    USE_REAL_FILE_SYSTEM();
+
+    DebugManagerStateRestore dbgRestore;
+    debugManager.flags.EnableIsaAllocationPool.set(false);
+
    auto svmData = context->getSVMAllocsManager()->getSVMAlloc(ptrSVM);
    ASSERT_NE(nullptr, svmData);
    GraphicsAllocation *svmAllocation = svmData->gpuAllocations.getGraphicsAllocation(context->getDevice(0)->getRootDeviceIndex());
--- a/opencl/test/unit_test/fixtures/kernel_data_fixture.cpp
+++ b/opencl/test/unit_test/fixtures/kernel_data_fixture.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -116,15 +116,22 @@ void KernelDataTest::buildAndDecode() {
        EXPECT_EQ(0, memcmp(pKernelInfo->heapInfo.pSsh, pSsh, sshSize));
    }
    if (kernelHeapSize) {
-        auto kernelAllocation = pKernelInfo->getGraphicsAllocation();
+        auto kernelAllocation = pKernelInfo->getIsaGraphicsAllocation();
        UNRECOVERABLE_IF(kernelAllocation == nullptr);
        auto &device = pContext->getDevice(0)->getDevice();
        auto &helper = device.getRootDeviceEnvironment().getHelper<GfxCoreHelper>();
        size_t isaPadding = helper.getPaddingForISAAllocation();
-        EXPECT_EQ(kernelAllocation->getUnderlyingBufferSize(), kernelHeapSize + isaPadding);
+        size_t expectedIsaSize = kernelHeapSize + isaPadding;
+        if (program->getKernelsIsaParentAllocation(rootDeviceIndex)) {
+            expectedIsaSize = alignUp(expectedIsaSize, pContext->getDevice(0)->getDevice().getGfxCoreHelper().getKernelIsaPointerAlignment());
+        }
+        EXPECT_EQ(pKernelInfo->getIsaSize(), expectedIsaSize);
        auto kernelIsa = kernelAllocation->getUnderlyingBuffer();
+        if (pKernelInfo->getIsaParentAllocation()) {
+            kernelIsa = ptrOffset(kernelIsa, pKernelInfo->getIsaOffsetInParentAllocation());
+        }
        EXPECT_EQ(0, memcmp(kernelIsa, pKernelInfo->heapInfo.pKernelHeap, kernelHeapSize));
    } else {
-        EXPECT_EQ(nullptr, pKernelInfo->getGraphicsAllocation());
+        EXPECT_EQ(nullptr, pKernelInfo->getIsaGraphicsAllocation());
    }
 }
--- a/opencl/test/unit_test/fixtures/kernel_data_fixture.h
+++ b/opencl/test/unit_test/fixtures/kernel_data_fixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -60,8 +60,9 @@ class KernelDataTest : public testing::Test {
    }

    void TearDown() override {
-        if (pKernelInfo->kernelAllocation) {
-            pContext->getDevice(0)->getMemoryManager()->freeGraphicsMemory(pKernelInfo->kernelAllocation);
+        auto isaAllocation = pKernelInfo->getIsaGraphicsAllocation();
+        if (isaAllocation && !pKernelInfo->getIsaParentAllocation()) {
+            pContext->getDevice(0)->getMemoryManager()->freeGraphicsMemory(isaAllocation);
            const_cast<KernelInfo *>(pKernelInfo)->kernelAllocation = nullptr;
        }
        program.reset();
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@@ -170,6 +170,7 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture {
  public:
    void setUp() {
        debugManager.flags.GTPinAllocateBufferInSharedMemory.set(false);
+        debugManager.flags.EnableIsaAllocationPool.set(false);
        setUpImpl();
    }

--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@@ -219,7 +219,8 @@ TEST_F(KernelTests, givenBinaryWhenItIsQueriedForGpuAddressThenAbsoluteAddressIs
    EXPECT_EQ(CL_SUCCESS, retVal);

    auto gmmHelper = pDevice->getGmmHelper();
-    auto expectedGpuAddress = gmmHelper->decanonize(kernel->getKernelInfo().kernelAllocation->getGpuAddress());
+    auto &kernelInfo = kernel->getKernelInfo();
+    auto expectedGpuAddress = gmmHelper->decanonize(kernelInfo.getIsaGraphicsAllocation()->getGpuAddress() + kernelInfo.getIsaOffsetInParentAllocation());
    EXPECT_EQ(expectedGpuAddress, paramValue);
    EXPECT_EQ(paramValueSize, paramValueSizeRet);
 }
@@ -2900,11 +2901,11 @@ TEST(KernelInfoTest, givenGfxCoreHelperWhenCreatingKernelAllocationThenCorrectPa
    mockKernel->kernelInfo.heapInfo.pKernelHeap = &kernelHeap;
    mockKernel->kernelInfo.createKernelAllocation(clDevice->getDevice(), false);

-    auto graphicsAllocation = mockKernel->kernelInfo.getGraphicsAllocation();
+    auto graphicsAllocation = mockKernel->kernelInfo.getIsaGraphicsAllocation();
    auto &helper = clDevice->getRootDeviceEnvironment().getHelper<GfxCoreHelper>();
    size_t isaPadding = helper.getPaddingForISAAllocation();
    EXPECT_EQ(graphicsAllocation->getUnderlyingBufferSize(), mockKernel->kernelInfo.heapInfo.kernelHeapSize + isaPadding);
-    clDevice->getMemoryManager()->freeGraphicsMemory(mockKernel->kernelInfo.getGraphicsAllocation());
+    clDevice->getMemoryManager()->freeGraphicsMemory(mockKernel->kernelInfo.getIsaGraphicsAllocation());
 }

 TEST(KernelTest, WhenSettingKernelArgThenBuiltinDispatchInfoBuilderIsUsed) {
@@ -3469,12 +3470,12 @@ TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeFalseWhenGettingStartOffse
    mockKernel.kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad = 128;

    mockKernel.kernelInfo.createKernelAllocation(device->getDevice(), false);
-    auto allocationOffset = mockKernel.kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
+    auto allocationOffset = mockKernel.kernelInfo.getIsaGraphicsAllocation()->getGpuAddressToPatch();

    mockKernel.mockKernel->setStartOffset(128);
    auto offset = mockKernel.mockKernel->getKernelStartAddress(false, true, false, false);
    EXPECT_EQ(allocationOffset + 256u, offset);
-    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getGraphicsAllocation());
+    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getIsaGraphicsAllocation());
 }

 TEST(KernelTest, givenFullAddressRequestWhenAskingForKernelStartAddressThenReturnFullAddress) {
@@ -3485,9 +3486,9 @@ TEST(KernelTest, givenFullAddressRequestWhenAskingForKernelStartAddressThenRetur
    mockKernel.kernelInfo.createKernelAllocation(device->getDevice(), false);

    auto address = mockKernel.mockKernel->getKernelStartAddress(false, true, false, true);
-    EXPECT_EQ(mockKernel.kernelInfo.getGraphicsAllocation()->getGpuAddress(), address);
+    EXPECT_EQ(mockKernel.kernelInfo.getIsaGraphicsAllocation()->getGpuAddress(), address);

-    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getGraphicsAllocation());
+    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getIsaGraphicsAllocation());
 }

 TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeTrueAndLocalIdsUsedWhenGettingStartOffsetThenOffsetToSkipPerThreadDataLoadIsNotAdded) {
@@ -3498,12 +3499,12 @@ TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeTrueAndLocalIdsUsedWhenGet
    mockKernel.kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad = 128;

    mockKernel.kernelInfo.createKernelAllocation(device->getDevice(), false);
-    auto allocationOffset = mockKernel.kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
+    auto allocationOffset = mockKernel.kernelInfo.getIsaGraphicsAllocation()->getGpuAddressToPatch();

    mockKernel.mockKernel->setStartOffset(128);
    auto offset = mockKernel.mockKernel->getKernelStartAddress(true, true, false, false);
    EXPECT_EQ(allocationOffset + 128u, offset);
-    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getGraphicsAllocation());
+    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getIsaGraphicsAllocation());
 }

 TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeFalseAndLocalIdsNotUsedWhenGettingStartOffsetThenOffsetToSkipPerThreadDataLoadIsNotAdded) {
@@ -3514,12 +3515,12 @@ TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeFalseAndLocalIdsNotUsedWhe
    mockKernel.kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad = 128;

    mockKernel.kernelInfo.createKernelAllocation(device->getDevice(), false);
-    auto allocationOffset = mockKernel.kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
+    auto allocationOffset = mockKernel.kernelInfo.getIsaGraphicsAllocation()->getGpuAddressToPatch();

    mockKernel.mockKernel->setStartOffset(128);
    auto offset = mockKernel.mockKernel->getKernelStartAddress(false, false, false, false);
    EXPECT_EQ(allocationOffset + 128u, offset);
-    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getGraphicsAllocation());
+    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getIsaGraphicsAllocation());
 }

 TEST(KernelTest, whenKernelIsInitializedThenThreadArbitrationPolicyIsSetToDefaultValue) {
@@ -4163,3 +4164,89 @@ TEST(KernelTest, whenCallingGetEnqueuedLocalWorkSizeValuesThenReturnProperValues
    EXPECT_EQ(expectedELWS[1], *(enqueuedLocalWorkSize[1]));
    EXPECT_EQ(expectedELWS[2], *(enqueuedLocalWorkSize[2]));
 }
+
+class KernelInfoIsaAllocationTest : public ::testing::Test {
+  protected:
+    void SetUp() override {
+        standaloneAllocation = std::make_unique<MockGraphicsAllocation>();
+        parentAllocation = std::make_unique<MockGraphicsAllocation>();
+        kernelInfo = std::make_unique<MockKernelInfo>();
+    }
+
+    void TearDown() override {}
+
+    std::unique_ptr<MockGraphicsAllocation> standaloneAllocation;
+    std::unique_ptr<MockGraphicsAllocation> parentAllocation;
+    std::unique_ptr<MockKernelInfo> kernelInfo;
+};
+
+TEST_F(KernelInfoIsaAllocationTest, givenStandaloneAllocationWhenQueryingIsaPropertiesThenCorrectValuesReturned) {
+    constexpr size_t allocationSize = 4096u;
+    standaloneAllocation->setSize(allocationSize);
+
+    kernelInfo->setIsaPerKernelAllocation(standaloneAllocation.get());
+
+    EXPECT_EQ(standaloneAllocation.get(), kernelInfo->getGraphicsAllocation());
+    EXPECT_EQ(standaloneAllocation.get(), kernelInfo->getIsaGraphicsAllocation());
+    EXPECT_EQ(allocationSize, kernelInfo->getIsaSize());
+    EXPECT_EQ(nullptr, kernelInfo->getIsaParentAllocation());
+    EXPECT_EQ(0u, kernelInfo->getIsaOffsetInParentAllocation());
+    EXPECT_EQ(0u, kernelInfo->getIsaSubAllocationSize());
+}
+
+TEST_F(KernelInfoIsaAllocationTest, givenParentAllocationWhenSettingIsaPropertiesThenCorrectValuesReturned) {
+    constexpr size_t offset = 1024u;
+    constexpr size_t subAllocationSize = 2048u;
+
+    kernelInfo->setIsaParentAllocation(parentAllocation.get());
+    kernelInfo->setIsaSubAllocationOffset(offset);
+    kernelInfo->setIsaSubAllocationSize(subAllocationSize);
+
+    EXPECT_EQ(parentAllocation.get(), kernelInfo->getIsaParentAllocation());
+    EXPECT_EQ(parentAllocation.get(), kernelInfo->getIsaGraphicsAllocation());
+    EXPECT_EQ(subAllocationSize, kernelInfo->getIsaSize());
+    EXPECT_EQ(offset, kernelInfo->getIsaOffsetInParentAllocation());
+    EXPECT_EQ(subAllocationSize, kernelInfo->getIsaSubAllocationSize());
+    EXPECT_EQ(nullptr, kernelInfo->kernelAllocation);
+
+    // Test updating offset and size
+    constexpr size_t newOffset = 2048u;
+    constexpr size_t newSize = 4096u;
+    kernelInfo->setIsaSubAllocationOffset(newOffset);
+    kernelInfo->setIsaSubAllocationSize(newSize);
+
+    EXPECT_EQ(newOffset, kernelInfo->getIsaOffsetInParentAllocation());
+    EXPECT_EQ(newSize, kernelInfo->getIsaSubAllocationSize());
+    EXPECT_EQ(newSize, kernelInfo->getIsaSize());
+}
+
+TEST_F(KernelInfoIsaAllocationTest, givenNoAllocationWhenQueryingIsaPropertiesThenDefaultValuesReturned) {
+    EXPECT_EQ(nullptr, kernelInfo->getGraphicsAllocation());
+    EXPECT_EQ(nullptr, kernelInfo->getIsaParentAllocation());
+    EXPECT_EQ(0u, kernelInfo->getIsaOffsetInParentAllocation());
+    EXPECT_EQ(0u, kernelInfo->getIsaSubAllocationSize());
+}
+
+TEST_F(KernelInfoIsaAllocationTest, givenTransitionBetweenAllocationTypesWhenChangingPropertiesThenCorrectStateIsMaintained) {
+    constexpr size_t standaloneSize = 4096u;
+    constexpr size_t pooledOffset = 512u;
+    constexpr size_t pooledSize = 2048u;
+
+    // Start with standalone
+    standaloneAllocation->setSize(standaloneSize);
+    kernelInfo->setIsaPerKernelAllocation(standaloneAllocation.get());
+    EXPECT_EQ(standaloneSize, kernelInfo->getIsaSize());
+    EXPECT_EQ(standaloneAllocation.get(), kernelInfo->getIsaGraphicsAllocation());
+    EXPECT_EQ(nullptr, kernelInfo->getIsaParentAllocation());
+
+    // Transition to pooled
+    kernelInfo->kernelAllocation = nullptr;
+    kernelInfo->setIsaParentAllocation(parentAllocation.get());
+    kernelInfo->setIsaSubAllocationOffset(pooledOffset);
+    kernelInfo->setIsaSubAllocationSize(pooledSize);
+
+    EXPECT_EQ(pooledSize, kernelInfo->getIsaSize());
+    EXPECT_EQ(parentAllocation.get(), kernelInfo->getIsaGraphicsAllocation());
+    EXPECT_EQ(pooledOffset, kernelInfo->getIsaOffsetInParentAllocation());
+    EXPECT_EQ(nullptr, kernelInfo->kernelAllocation);
+}
--- a/opencl/test/unit_test/mocks/mock_program.h
+++ b/opencl/test/unit_test/mocks/mock_program.h
@@ -233,9 +233,11 @@ class MockProgram : public Program {
    }

    void debugNotify(const ClDeviceVector &deviceVector, std::unordered_map<uint32_t, BuildPhase> &phasesReached) override {
+        if (callBaseDebugNotify) {
            Program::debugNotify(deviceVector, phasesReached);
            wasDebuggerNotified = true;
        }
+    }

    void callPopulateZebinExtendedArgsMetadataOnce(uint32_t rootDeviceIndex) override {
        wasPopulateZebinExtendedArgsMetadataOnceCalled = true;
@@ -245,18 +247,35 @@ class MockProgram : public Program {
        }
    }

+    bool transferIsaSegmentsToAllocation(Device *pDevice, std::vector<KernelInfo *> &kernelInfoArray, const Linker::PatchableSegments *isaSegmentsForPatching, uint32_t rootDeviceIndex) override {
+        if (transferIsaSegmentsToAllocationOverride != -1) {
+            return (transferIsaSegmentsToAllocationOverride > 0);
+        }
+        return Program::transferIsaSegmentsToAllocation(pDevice, kernelInfoArray, isaSegmentsForPatching, rootDeviceIndex);
+    }
+
+    bool isIsaPoolingEnabled(Device &neoDevice) override {
+        if (isIsaPoolingEnabledOverride != -1) {
+            return (isIsaPoolingEnabledOverride > 0);
+        }
+        return Program::isIsaPoolingEnabled(neoDevice);
+    }
+
    std::vector<NEO::ExternalFunctionInfo> externalFunctions;
    std::map<uint32_t, int> processGenBinaryCalledPerRootDevice;
    std::map<uint32_t, int> replaceDeviceBinaryCalledPerRootDevice;
    static int getInternalOptionsCalled;
    int isFlagOptionOverride = -1;
    int isOptionValueValidOverride = -1;
+    int transferIsaSegmentsToAllocationOverride = -1;
+    int isIsaPoolingEnabledOverride = -1;
    bool contextSet = false;
    bool wasProcessDebugDataCalled = false;
    bool wasCreateDebugZebinCalled = false;
    bool wasDebuggerNotified = false;
    bool wasPopulateZebinExtendedArgsMetadataOnceCalled = false;
    bool callBasePopulateZebinExtendedArgsMetadataOnce = false;
+    bool callBaseDebugNotify = true;
    auto getIntermediateRepresentation() const { return this->intermediateRepresentation; }
    auto getIsGeneratedByIgc() const { return this->isGeneratedByIgc; }
    auto &getBuildInfos() { return this->buildInfos; }
--- a/opencl/test/unit_test/os_interface/linux/cl_drm_memory_manager_tests.cpp
+++ b/opencl/test/unit_test/os_interface/linux/cl_drm_memory_manager_tests.cpp
@@ -432,6 +432,9 @@ HWTEST2_TEMPLATED_F(ClDrmMemoryManagerTest, givenDrmMemoryManagerWhenTiledImageI
        GTEST_SKIP();
    }

+    DebugManagerStateRestore dbgRestore;
+    debugManager.flags.EnableIsaAllocationPool.set(false);
+
    device->setPreemptionMode(PreemptionMode::Disabled);

    mock->ioctlExpected.gemCreate = 1;
--- a/opencl/test/unit_test/program/program_data_tests.cpp
+++ b/opencl/test/unit_test/program/program_data_tests.cpp
@@ -838,7 +838,7 @@ TEST(ProgramLinkBinaryTest, whenLinkerUnresolvedExternalThenLinkFailedAndBuildLo
    expectedUnresolvedExternals.push_back(Linker::UnresolvedExternal{relocation, 0, false});
    auto expectedError = constructLinkerErrorMessage(expectedUnresolvedExternals, std::vector<std::string>{"kernel : " + kernelInfo.kernelDescriptor.kernelMetadata.kernelName});
    EXPECT_TRUE(hasSubstr(buildLog, expectedError));
-    device->getMemoryManager()->freeGraphicsMemory(kernelInfo.getGraphicsAllocation());
+    device->getMemoryManager()->freeGraphicsMemory(kernelInfo.getIsaGraphicsAllocation());
 }

 HWTEST2_F(ProgramDataTest, whenLinkerInputValidThenIsaIsProperlyPatched, MatchAny) {
@@ -978,7 +978,7 @@ TEST(ProgramStringSectionTest, WhenConstStringBufferIsPresentThenUseItForLinking

    program.setLinkerInput(rootDeviceIndex, std::move(linkerInput));

-    auto isaCpuPtr = reinterpret_cast<char *>(kernelInfo.getGraphicsAllocation()->getUnderlyingBuffer());
+    auto isaCpuPtr = reinterpret_cast<char *>(kernelInfo.getIsaGraphicsAllocation()->getUnderlyingBuffer());
    auto patchAddr = ptrOffset(isaCpuPtr, 0x8);

    const char constStringData[] = "Hello World!\n";
--- a/opencl/test/unit_test/program/program_tests.cpp
+++ b/opencl/test/unit_test/program/program_tests.cpp
@@ -41,10 +41,12 @@
 #include "shared/test/common/mocks/mock_ail_configuration.h"
 #include "shared/test/common/mocks/mock_allocation_properties.h"
 #include "shared/test/common/mocks/mock_compiler_interface.h"
+#include "shared/test/common/mocks/mock_debugger.h"
 #include "shared/test/common/mocks/mock_elf.h"
 #include "shared/test/common/mocks/mock_graphics_allocation.h"
 #include "shared/test/common/mocks/mock_modules_zebin.h"
 #include "shared/test/common/mocks/mock_product_helper.h"
+#include "shared/test/common/mocks/mock_tbx_csr.h"
 #include "shared/test/common/mocks/mock_usm_memory_pool.h"
 #include "shared/test/common/test_macros/hw_test.h"
 #include "shared/test/common/utilities/base_object_utils.h"
@@ -511,12 +513,67 @@ TEST_F(ProgramFromBinaryTest, givenProgramWhenItIsBeingBuildThenItContainsGraphi
    pProgram->build(pProgram->getDevices(), nullptr);
    auto kernelInfo = pProgram->getKernelInfo(size_t(0), rootDeviceIndex);

-    auto graphicsAllocation = kernelInfo->getGraphicsAllocation();
+    auto graphicsAllocation = kernelInfo->getIsaGraphicsAllocation();
    ASSERT_NE(nullptr, graphicsAllocation);
    EXPECT_TRUE(graphicsAllocation->is32BitAllocation());
    auto &helper = pDevice->getRootDeviceEnvironment().getHelper<GfxCoreHelper>();
    size_t isaPadding = helper.getPaddingForISAAllocation();
+    bool isIsaPooled = (pProgram->getKernelsIsaParentAllocation(rootDeviceIndex) != nullptr);
+    if (!isIsaPooled) {
        EXPECT_EQ(graphicsAllocation->getUnderlyingBufferSize(), kernelInfo->heapInfo.kernelHeapSize + isaPadding);
+    }
+    EXPECT_EQ(kernelInfo->getIsaSize(), kernelInfo->heapInfo.kernelHeapSize + isaPadding);
+
+    auto kernelIsa = ptrOffset(graphicsAllocation->getUnderlyingBuffer(), kernelInfo->getIsaOffsetInParentAllocation());
+    EXPECT_NE(kernelInfo->heapInfo.pKernelHeap, kernelIsa);
+    EXPECT_EQ(0, memcmp(kernelIsa, kernelInfo->heapInfo.pKernelHeap, kernelInfo->heapInfo.kernelHeapSize));
+    auto rootDeviceIndex = graphicsAllocation->getRootDeviceIndex();
+    auto gmmHelper = pDevice->getGmmHelper();
+    EXPECT_EQ(gmmHelper->decanonize(graphicsAllocation->getGpuBaseAddress()), pDevice->getMemoryManager()->getInternalHeapBaseAddress(rootDeviceIndex, graphicsAllocation->isAllocatedInLocalMemoryPool()));
+}
+
+class ProgramFromBinaryIsaPoolingTest : public ProgramFromBinaryTest {
+  public:
+    void SetUp() override {
+        ProgramFromBinaryFixture::SetUp();
+        enableIsaPooling();
+    }
+
+    void TearDown() override {
+        ProgramFromBinaryFixture::TearDown();
+    }
+
+    void enableIsaPooling() {
+        pProgram->isIsaPoolingEnabledOverride = 1;
+    }
+
+    void disableIsaPooling() {
+        pProgram->isIsaPoolingEnabledOverride = 0;
+    }
+
+    void defaultIsaPooling() {
+        pProgram->isIsaPoolingEnabledOverride = -1;
+    }
+};
+
+TEST_F(ProgramFromBinaryIsaPoolingTest, givenEnabledIsaAllocationPoolWhenBuildingProgramThenIsaAllocationIsPartOfSharedParentAllocation) {
+    pProgram->build(pProgram->getDevices(), nullptr);
+
+    EXPECT_NE(nullptr, pProgram->getKernelsIsaParentAllocation(rootDeviceIndex));
+
+    auto kernelInfo = pProgram->getKernelInfo(size_t(0), rootDeviceIndex);
+
+    auto graphicsAllocation = kernelInfo->getIsaGraphicsAllocation();
+    ASSERT_NE(nullptr, graphicsAllocation);
+    EXPECT_TRUE(graphicsAllocation->is32BitAllocation());
+
+    EXPECT_TRUE(pDevice->getIsaPoolAllocator().isPoolBuffer(graphicsAllocation));
+    EXPECT_EQ(nullptr, kernelInfo->kernelAllocation);
+
+    auto expectedIsaSize = pProgram->computeKernelIsaAllocationAlignedSizeWithPadding(*pDevice, kernelInfo->heapInfo.kernelHeapSize, true);
+    EXPECT_EQ(expectedIsaSize, kernelInfo->getIsaSize());
+
+    EXPECT_EQ(0u, kernelInfo->getIsaOffsetInParentAllocation());

    auto kernelIsa = graphicsAllocation->getUnderlyingBuffer();
    EXPECT_NE(kernelInfo->heapInfo.pKernelHeap, kernelIsa);
@@ -526,6 +583,53 @@ TEST_F(ProgramFromBinaryTest, givenProgramWhenItIsBeingBuildThenItContainsGraphi
    EXPECT_EQ(gmmHelper->decanonize(graphicsAllocation->getGpuBaseAddress()), pDevice->getMemoryManager()->getInternalHeapBaseAddress(rootDeviceIndex, graphicsAllocation->isAllocatedInLocalMemoryPool()));
 }

+TEST_F(ProgramFromBinaryIsaPoolingTest, givenEnabledIsaAllocationPoolWhenBuildingProgramAndTransferIsaSegmentsToAllocationFailsThenReturnOutOfHostMemory) {
+    pProgram->transferIsaSegmentsToAllocationOverride = 0;
+    retVal = pProgram->build(pProgram->getDevices(), nullptr);
+    EXPECT_EQ(CL_OUT_OF_HOST_MEMORY, retVal);
+}
+
+TEST_F(ProgramFromBinaryIsaPoolingTest, givenEnabledIsaAllocationPoolWhenBuildingProgramAndRequestGraphicsAllocationForIsaFailsThenReturnOutOfHostMemory) {
+    auto memoryManager = static_cast<MockMemoryManager *>(pDevice->getMemoryManager());
+    memoryManager->isMockHostMemoryManager = true;
+    memoryManager->forceFailureInPrimaryAllocation = true;
+
+    retVal = pProgram->build(pProgram->getDevices(), nullptr);
+    EXPECT_EQ(CL_OUT_OF_HOST_MEMORY, retVal);
+    EXPECT_EQ(nullptr, pProgram->getKernelsIsaParentAllocation(rootDeviceIndex));
+}
+
+TEST_F(ProgramFromBinaryIsaPoolingTest, givenDisabledIsaAllocationPoolWhenBuildingProgramAndCreateKernelAllocationFailsThenReturnOutOfHostMemory) {
+    disableIsaPooling();
+
+    auto memoryManager = static_cast<MockMemoryManager *>(pDevice->getMemoryManager());
+    memoryManager->isMockHostMemoryManager = true;
+    memoryManager->forceFailureInPrimaryAllocation = true;
+
+    retVal = pProgram->build(pProgram->getDevices(), nullptr);
+    EXPECT_EQ(CL_OUT_OF_HOST_MEMORY, retVal);
+}
+
+TEST_F(ProgramFromBinaryIsaPoolingTest, givenEnabled2MBLocalMemAlignmentGTPinInitializedWhenBuildingProgramThenIsaAllocationPoolIsNotUsed) {
+    defaultIsaPooling();
+
+    pProgram->callBaseDebugNotify = false;
+
+    auto mockProductHelper = new MockProductHelper;
+    pDevice->getRootDeviceEnvironmentRef().productHelper.reset(mockProductHelper);
+    mockProductHelper->is2MBLocalMemAlignmentEnabledResult = true;
+
+    isGTPinInitialized = true;
+    pProgram->build(pProgram->getDevices(), nullptr);
+    isGTPinInitialized = false;
+
+    auto kernelInfo = pProgram->getKernelInfo(size_t(0), rootDeviceIndex);
+
+    EXPECT_EQ(nullptr, pProgram->getKernelsIsaParentAllocation(rootDeviceIndex));
+    EXPECT_NE(nullptr, kernelInfo->kernelAllocation);
+    EXPECT_FALSE(pDevice->getIsaPoolAllocator().isPoolBuffer(kernelInfo->getIsaGraphicsAllocation()));
+}
+
 TEST_F(ProgramFromBinaryTest, whenProgramIsBeingRebuildThenOutdatedGlobalBuffersAreFreed) {
    pProgram->build(pProgram->getDevices(), nullptr);
    EXPECT_EQ(nullptr, pProgram->buildInfos[pClDevice->getRootDeviceIndex()].constantSurface);
@@ -695,6 +799,33 @@ TEST_F(ProgramFromBinaryTest, givenProgramWithGlobalAndConstAllocationsWhenGetti
    pProgram->buildInfos[pClDevice->getRootDeviceIndex()].constantSurface = std::make_unique<SharedPoolAllocation>(new MockGraphicsAllocation());
    pProgram->buildInfos[pClDevice->getRootDeviceIndex()].globalSurface = std::make_unique<SharedPoolAllocation>(new MockGraphicsAllocation());

+    auto allocs = pProgram->getModuleAllocations(pClDevice->getRootDeviceIndex());
+
+    auto expectedSize = 2u + (pProgram->getKernelsIsaParentAllocation(pClDevice->getRootDeviceIndex()) ? 1u : pProgram->getNumKernels());
+    EXPECT_EQ(expectedSize, allocs.size());
+
+    auto iter = std::find(allocs.begin(), allocs.end(), pProgram->buildInfos[pClDevice->getRootDeviceIndex()].constantSurface->getGraphicsAllocation());
+    EXPECT_NE(allocs.end(), iter);
+
+    iter = std::find(allocs.begin(), allocs.end(), pProgram->buildInfos[pClDevice->getRootDeviceIndex()].globalSurface->getGraphicsAllocation());
+    EXPECT_NE(allocs.end(), iter);
+
+    if (auto isaParentAllocation = pProgram->getKernelsIsaParentAllocation(pClDevice->getRootDeviceIndex());
+        isaParentAllocation != nullptr) {
+        iter = std::find(allocs.begin(), allocs.end(), isaParentAllocation);
+        EXPECT_NE(allocs.end(), iter);
+    } else {
+        iter = std::find(allocs.begin(), allocs.end(), pProgram->buildInfos[pClDevice->getRootDeviceIndex()].kernelInfoArray[0]->getIsaGraphicsAllocation());
+        EXPECT_NE(allocs.end(), iter);
+    }
+}
+
+TEST_F(ProgramFromBinaryIsaPoolingTest, givenEnabledIsaAllocationPoolAndProgramWithGlobalAndConstAllocationsWhenGettingModuleAllocationsThenAllAreReturned) {
+    pProgram->build(pProgram->getDevices(), nullptr);
+    pProgram->processGenBinary(*pClDevice);
+    pProgram->buildInfos[pClDevice->getRootDeviceIndex()].constantSurface = std::make_unique<SharedPoolAllocation>(new MockGraphicsAllocation());
+    pProgram->buildInfos[pClDevice->getRootDeviceIndex()].globalSurface = std::make_unique<SharedPoolAllocation>(new MockGraphicsAllocation());
+
    auto allocs = pProgram->getModuleAllocations(pClDevice->getRootDeviceIndex());
    EXPECT_EQ(pProgram->getNumKernels() + 2u, allocs.size());

@@ -704,10 +835,127 @@ TEST_F(ProgramFromBinaryTest, givenProgramWithGlobalAndConstAllocationsWhenGetti
    iter = std::find(allocs.begin(), allocs.end(), pProgram->buildInfos[pClDevice->getRootDeviceIndex()].globalSurface->getGraphicsAllocation());
    EXPECT_NE(allocs.end(), iter);

-    iter = std::find(allocs.begin(), allocs.end(), pProgram->buildInfos[pClDevice->getRootDeviceIndex()].kernelInfoArray[0]->getGraphicsAllocation());
+    iter = std::find(allocs.begin(), allocs.end(), pProgram->buildInfos[pClDevice->getRootDeviceIndex()].kernelInfoArray[0]->getIsaParentAllocation());
    EXPECT_NE(allocs.end(), iter);
 }

+HWTEST_F(ProgramFromBinaryIsaPoolingTest, givenTbxModeAndPooledIsaWhenTransferringSegmentsThenWriteMemoryIsCalled) {
+    auto tbxCsr = new MockTbxCsr<FamilyType>(*pDevice->executionEnvironment, pDevice->getDeviceBitfield());
+    pDevice->resetCommandStreamReceiver(tbxCsr);
+
+    retVal = pProgram->build(pProgram->getDevices(), nullptr);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_NE(nullptr, pProgram->getKernelsIsaParentAllocation(rootDeviceIndex));
+
+    EXPECT_TRUE(tbxCsr->writeMemoryGfxAllocCalled);
+}
+
+class ProgramIsaPoolingEnabledTest : public ProgramFromBinaryTest {
+  public:
+    void SetUp() override {
+        ProgramFromBinaryTest::SetUp();
+        pDevice->getRootDeviceEnvironmentRef().productHelper.reset(mockProductHelper);
+    }
+
+    void TearDown() override {
+        ProgramFromBinaryTest::TearDown();
+    }
+
+    MockProductHelper *mockProductHelper = new MockProductHelper;
+    DebugManagerStateRestore restorer;
+};
+
+TEST_F(ProgramIsaPoolingEnabledTest, givenDebugFlagSetWhenCheckingIsaPoolingThenReturnCorrectValue) {
+    {
+        debugManager.flags.EnableIsaAllocationPool.set(1);
+        EXPECT_TRUE(pProgram->isIsaPoolingEnabled(*pDevice));
+    }
+    {
+        debugManager.flags.EnableIsaAllocationPool.set(0);
+        EXPECT_FALSE(pProgram->isIsaPoolingEnabled(*pDevice));
+    }
+}
+
+TEST_F(ProgramIsaPoolingEnabledTest, givenDebugFlagDefaultAnd2MBAlignmentDisabledWhenCheckingIsaPoolingThenReturnFalse) {
+    debugManager.flags.EnableIsaAllocationPool.set(-1);
+    mockProductHelper->is2MBLocalMemAlignmentEnabledResult = false;
+
+    EXPECT_FALSE(pProgram->isIsaPoolingEnabled(*pDevice));
+}
+
+TEST_F(ProgramIsaPoolingEnabledTest, givenDebugFlagDefaultAndL0DebuggerPresentWhenCheckingIsaPoolingThenReturnFalse) {
+    debugManager.flags.EnableIsaAllocationPool.set(-1);
+    mockProductHelper->is2MBLocalMemAlignmentEnabledResult = true;
+
+    pDevice->getRootDeviceEnvironmentRef().debugger.reset(new MockDebugger);
+    pDevice->setDebugger(pDevice->getRootDeviceEnvironmentRef().debugger.get());
+
+    EXPECT_FALSE(pProgram->isIsaPoolingEnabled(*pDevice));
+
+    pDevice->getRootDeviceEnvironmentRef().debugger.reset(nullptr);
+    pDevice->setDebugger(nullptr);
+}
+
+TEST_F(ProgramIsaPoolingEnabledTest, givenDebugFlagDefaultAndGTPinInitializedWhenCheckingIsaPoolingThenReturnFalse) {
+    debugManager.flags.EnableIsaAllocationPool.set(-1);
+    mockProductHelper->is2MBLocalMemAlignmentEnabledResult = true;
+
+    isGTPinInitialized = true;
+    EXPECT_FALSE(pProgram->isIsaPoolingEnabled(*pDevice));
+    isGTPinInitialized = false;
+}
+
+TEST_F(ProgramIsaPoolingEnabledTest, givenDebugFlagDefaultAndKernelBinaryReuseEnabledWhenCheckingIsaPoolingThenReturnFalse) {
+    debugManager.flags.EnableIsaAllocationPool.set(-1);
+    debugManager.flags.ReuseKernelBinaries.set(1);
+    mockProductHelper->is2MBLocalMemAlignmentEnabledResult = true;
+
+    EXPECT_FALSE(pProgram->isIsaPoolingEnabled(*pDevice));
+}
+
+TEST_F(ProgramIsaPoolingEnabledTest, givenDebugFlagDefaultAndAllConditionsMetWhenCheckingIsaPoolingThenReturnTrue) {
+    debugManager.flags.EnableIsaAllocationPool.set(-1);
+    debugManager.flags.ReuseKernelBinaries.set(0);
+    mockProductHelper->is2MBLocalMemAlignmentEnabledResult = true;
+
+    isGTPinInitialized = false;
+
+    EXPECT_TRUE(pProgram->isIsaPoolingEnabled(*pDevice));
+}
+
+TEST_F(ProgramFromBinaryTest, givenVariousIsaSizesAndKernelPositionsWhenComputingSizeThenCorrectAlignmentAndPaddingAreApplied) {
+    auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
+    auto alignment = gfxCoreHelper.getKernelIsaPointerAlignment();
+    auto padding = gfxCoreHelper.getPaddingForISAAllocation();
+
+    std::vector<size_t> testSizes = {0u, 1u, 64u, 256u, 1024u, 4096u};
+
+    for (auto isaSize : testSizes) {
+        // Test last kernel (with padding)
+        auto sizeWithPadding = pProgram->computeKernelIsaAllocationAlignedSizeWithPadding(*pDevice, isaSize, true);
+        EXPECT_EQ(alignUp(isaSize + padding, alignment), sizeWithPadding);
+        EXPECT_TRUE(isAligned(sizeWithPadding, alignment));
+        EXPECT_GE(sizeWithPadding, isaSize + padding);
+
+        // Test not last kernel (without padding)
+        auto sizeWithoutPadding = pProgram->computeKernelIsaAllocationAlignedSizeWithPadding(*pDevice, isaSize, false);
+        EXPECT_EQ(alignUp(isaSize, alignment), sizeWithoutPadding);
+        EXPECT_TRUE(isAligned(sizeWithoutPadding, alignment));
+        EXPECT_GE(sizeWithoutPadding, isaSize);
+
+        // Size with padding should be >= size without padding
+        EXPECT_GE(sizeWithPadding, sizeWithoutPadding);
+    }
+
+    // Test already aligned size
+    size_t alignedIsaSize = alignment * 4;
+    auto alignedWithoutPadding = pProgram->computeKernelIsaAllocationAlignedSizeWithPadding(*pDevice, alignedIsaSize, false);
+    EXPECT_EQ(alignedIsaSize, alignedWithoutPadding);
+
+    auto alignedWithPadding = pProgram->computeKernelIsaAllocationAlignedSizeWithPadding(*pDevice, alignedIsaSize, true);
+    EXPECT_EQ(alignUp(alignedIsaSize + padding, alignment), alignedWithPadding);
+}
+
 using ProgramGetNumKernelsTest = Test<NEOProgramFixture>;
 TEST_F(ProgramGetNumKernelsTest, givenProgramWithFunctionsWhenGettingNumKernelsFunctionsAreNotExposed) {
    program->resizeAndPopulateKernelInfoArray(2);
@@ -741,13 +989,17 @@ HWTEST_F(ProgramFromBinaryTest, givenProgramWhenCleanCurrentKernelInfoIsCalledBu
    auto &csr = pDevice->getGpgpuCommandStreamReceiver();
    EXPECT_TRUE(csr.getTemporaryAllocations().peekIsEmpty());
    pProgram->build(pProgram->getDevices(), nullptr);
-    auto kernelAllocation = pProgram->getKernelInfo(static_cast<size_t>(0u), rootDeviceIndex)->getGraphicsAllocation();
+    auto kernelAllocation = pProgram->getKernelInfo(static_cast<size_t>(0u), rootDeviceIndex)->getIsaGraphicsAllocation();
+    const bool isIsaPooled = pProgram->getKernelsIsaParentAllocation(rootDeviceIndex);
    kernelAllocation->updateTaskCount(100, csr.getOsContext().getContextId());
    *csr.getTagAddress() = 0;
    pProgram->cleanCurrentKernelInfo(rootDeviceIndex);
-    EXPECT_TRUE(csr.getTemporaryAllocations().peekIsEmpty());
+    if (!isIsaPooled) {
        EXPECT_FALSE(csr.getDeferredAllocations().peekIsEmpty());
        EXPECT_EQ(csr.getDeferredAllocations().peekHead(), kernelAllocation);
+    } else {
+        EXPECT_TRUE(csr.getDeferredAllocations().peekIsEmpty());
+    }
    EXPECT_TRUE(this->pDevice->getUltCommandStreamReceiver<FamilyType>().requiresInstructionCacheFlush);
 }

@@ -757,7 +1009,7 @@ HWTEST_F(ProgramFromBinaryTest, givenIsaAllocationUsedByMultipleCsrsWhenItIsDele

    pProgram->build(pProgram->getDevices(), nullptr);

-    auto kernelAllocation = pProgram->getKernelInfo(static_cast<size_t>(0u), rootDeviceIndex)->getGraphicsAllocation();
+    auto kernelAllocation = pProgram->getKernelInfo(static_cast<size_t>(0u), rootDeviceIndex)->getIsaGraphicsAllocation();

    csr0.makeResident(*kernelAllocation);
    csr1.makeResident(*kernelAllocation);
@@ -3267,6 +3519,52 @@ TEST_F(ProgramBinTest, GivenDebugDataAvailableWhenLinkingProgramThenDebugDataIsS
    EXPECT_NE(nullptr, pProgram->getDebugData(rootDeviceIndex));
 }

+TEST_F(ProgramBinTest, givenEnabledIsaAllocationPoolWhenMultipleProgramsCreatedThenProgramsShareIsaAllocation) {
+    DebugDataGuard debugDataGuard{true};
+
+    DebugManagerStateRestore restorer;
+    debugManager.flags.EnableIsaAllocationPool.set(1);
+
+    const size_t numPrograms = 5;
+    std::vector<MockProgram *> programs;
+    NEO::GraphicsAllocation *sharedAllocation;
+
+    const char *sourceCode = "__kernel void\nCB(\n__global unsigned int* src, __global unsigned int* dst)\n{\nint id = (int)get_global_id(0);\ndst[id] = src[id];\n}\n";
+
+    for (size_t i = 0; i < numPrograms; ++i) {
+        programs.emplace_back(Program::create<MockProgram>(
+            pContext,
+            1,
+            &sourceCode,
+            &knownSourceSize,
+            retVal));
+
+        retVal = programs[i]->build(programs[i]->getDevices(), nullptr);
+        EXPECT_EQ(CL_SUCCESS, retVal);
+
+        if (i == 0) {
+            sharedAllocation = programs[i]->getKernelsIsaParentAllocation(rootDeviceIndex);
+            EXPECT_TRUE(pDevice->getIsaPoolAllocator().isPoolBuffer(sharedAllocation));
+        }
+
+        auto kernelInfoArray = programs[i]->getKernelInfoArray(pDevice->getRootDeviceIndex());
+        auto offsetForKernelInfo = kernelInfoArray[0]->getIsaOffsetInParentAllocation();
+        for (auto &kernelInfo : kernelInfoArray) {
+            EXPECT_EQ(offsetForKernelInfo, kernelInfo->getIsaOffsetInParentAllocation());
+            offsetForKernelInfo += kernelInfo->getIsaSubAllocationSize();
+        }
+
+        // Verify that all kernel infos share same parent allocation
+        if (i != 0) {
+            EXPECT_EQ(sharedAllocation, programs[i]->getKernelsIsaParentAllocation(rootDeviceIndex));
+        }
+    }
+
+    for (auto &program : programs) {
+        program->release();
+    }
+}
+
 using ProgramMultiRootDeviceTests = MultiRootDeviceFixture;

 TEST_F(ProgramMultiRootDeviceTests, WhenProgramIsCreatedThenBuildInfosVectorIsProperlyResized) {
--- a/opencl/test/unit_test/program/program_with_zebin.cpp
+++ b/opencl/test/unit_test/program/program_with_zebin.cpp
@@ -46,7 +46,14 @@ void ProgramWithZebinFixture::populateProgramWithSegments(NEO::MockProgram *prog
    kernelInfo = std::make_unique<KernelInfo>();
    kernelInfo->kernelDescriptor.kernelMetadata.kernelName = ZebinTestData::ValidEmptyProgram<>::kernelName;
    mockAlloc = std::make_unique<MockGraphicsAllocation>();
+
+    if (isUsingSharedIsaAllocation) {
+        kernelInfo->setIsaParentAllocation(mockAlloc.get());
+        kernelInfo->setIsaSubAllocationOffset(isaSubAllocOffset);
+        kernelInfo->setIsaSubAllocationSize(isaSubAllocSize);
+    } else {
        kernelInfo->kernelAllocation = mockAlloc.get();
+    }

    program->addKernelInfo(kernelInfo.get(), rootDeviceIndex);

--- a/opencl/test/unit_test/program/program_with_zebin.h
+++ b/opencl/test/unit_test/program/program_with_zebin.h
@@ -27,6 +27,9 @@ class ProgramWithZebinFixture : public ProgramTests {
    std::unique_ptr<MockBuffer> globalSurface;
    std::unique_ptr<MockBuffer> constantSurface;
    const char strings[12] = "Hello olleH";
+    bool isUsingSharedIsaAllocation = false;
+    const size_t isaSubAllocOffset = 48u;
+    const size_t isaSubAllocSize = 308u;
    void SetUp() override;
    void TearDown() override;
    void addEmptyZebin(MockProgram *program);
--- a/opencl/test/unit_test/program/program_with_zebin_tests.cpp
+++ b/opencl/test/unit_test/program/program_with_zebin_tests.cpp
@@ -42,7 +42,7 @@ TEST_F(ProgramWithZebinFixture, givenZebinSegmentsThenSegmentsArePopulated) {
    };
    checkGPUSeg(program->buildInfos[rootDeviceIndex].constantSurface->getGraphicsAllocation(), segments.constData);
    checkGPUSeg(program->buildInfos[rootDeviceIndex].globalSurface->getGraphicsAllocation(), segments.varData);
-    checkGPUSeg(program->getKernelInfoArray(rootDeviceIndex)[0]->getGraphicsAllocation(), segments.nameToSegMap[ZebinTestData::ValidEmptyProgram<>::kernelName]);
+    checkGPUSeg(program->getKernelInfoArray(rootDeviceIndex)[0]->getIsaGraphicsAllocation(), segments.nameToSegMap[ZebinTestData::ValidEmptyProgram<>::kernelName]);

    EXPECT_EQ(reinterpret_cast<uintptr_t>(program->buildInfos[rootDeviceIndex].constStringSectionData.initData), segments.stringData.address);
    EXPECT_EQ(reinterpret_cast<const char *>(program->buildInfos[rootDeviceIndex].constStringSectionData.initData), strings);
@@ -65,6 +65,33 @@ TEST_F(ProgramWithZebinFixture, givenZebinSegmentsWithSharedGlobalAndConstSurfac
    checkGPUSeg(program->buildInfos[rootDeviceIndex].globalSurface.get(), segments.varData);
 }

+TEST_F(ProgramWithZebinFixture, givenSharedIsaAllocationWhenGetZebinSegmentsThenSegmentsAreCorrectlyPopulated) {
+    isUsingSharedIsaAllocation = true;
+    populateProgramWithSegments(program.get());
+    auto segments = program->getZebinSegments(rootDeviceIndex);
+
+    auto checkGPUSeg = [](NEO::GraphicsAllocation *alloc, NEO::Zebin::Debug::Segments::Segment segment) {
+        EXPECT_EQ(static_cast<uintptr_t>(alloc->getGpuAddress()), segment.address);
+        EXPECT_EQ(static_cast<size_t>(alloc->getUnderlyingBufferSize()), segment.size);
+    };
+    checkGPUSeg(program->buildInfos[rootDeviceIndex].constantSurface->getGraphicsAllocation(), segments.constData);
+    checkGPUSeg(program->buildInfos[rootDeviceIndex].globalSurface->getGraphicsAllocation(), segments.varData);
+
+    {
+        auto kernelInfo = program->getKernelInfoArray(rootDeviceIndex)[0];
+        auto segment = segments.nameToSegMap[ZebinTestData::ValidEmptyProgram<>::kernelName];
+        auto isaAlloc = kernelInfo->getIsaGraphicsAllocation();
+        auto offset = kernelInfo->getIsaOffsetInParentAllocation();
+
+        EXPECT_EQ(static_cast<uintptr_t>(isaAlloc->getGpuAddress() + offset), segment.address);
+        EXPECT_EQ(static_cast<size_t>(kernelInfo->getIsaSubAllocationSize()), segment.size);
+    }
+
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(program->buildInfos[rootDeviceIndex].constStringSectionData.initData), segments.stringData.address);
+    EXPECT_EQ(reinterpret_cast<const char *>(program->buildInfos[rootDeviceIndex].constStringSectionData.initData), strings);
+    EXPECT_EQ(program->buildInfos[rootDeviceIndex].constStringSectionData.size, sizeof(strings));
+}
+
 TEST_F(ProgramWithZebinFixture, givenNonEmptyDebugDataThenDebugZebinIsNotCreated) {
    addEmptyZebin(program.get());
    populateProgramWithSegments(program.get());
--- a/opencl/test/unit_test/xe2_hpg_core/enqueue_tests_xe2_hpg_core.cpp
+++ b/opencl/test/unit_test/xe2_hpg_core/enqueue_tests_xe2_hpg_core.cpp
@@ -45,7 +45,7 @@ struct EnqueueFixtureXe2HpgCore : public ::testing::Test {
    }

    void TearDown() override {
-        clDevice->getMemoryManager()->freeGraphicsMemory(mockKernel->kernelInfo.getGraphicsAllocation());
+        clDevice->getMemoryManager()->freeGraphicsMemory(mockKernel->kernelInfo.getIsaGraphicsAllocation());
    }

    template <typename FamilyType>
@@ -92,7 +92,7 @@ XE2_HPG_CORETEST_F(MemoryPrefetchTestsXe2HpgCore, givenKernelWhenWalkerIsProgram

    auto gmmHelper = clDevice->getRootDeviceEnvironment().getGmmHelper();

-    EXPECT_EQ(gmmHelper->decanonize(mockKernel->kernelInfo.getGraphicsAllocation()->getGpuAddress()), statePrefetchCmd->getAddress());
+    EXPECT_EQ(gmmHelper->decanonize(mockKernel->kernelInfo.getIsaGraphicsAllocation()->getGpuAddress()), statePrefetchCmd->getAddress());
    EXPECT_TRUE(statePrefetchCmd->getKernelInstructionPrefetch());
 }

--- a/opencl/test/unit_test/xe_hpc_core/enqueue_tests_xe_hpc_core.cpp
+++ b/opencl/test/unit_test/xe_hpc_core/enqueue_tests_xe_hpc_core.cpp
@@ -45,7 +45,7 @@ struct EnqueueFixtureXeHpcCore : public ::testing::Test {
    }

    void TearDown() override {
-        clDevice->getMemoryManager()->freeGraphicsMemory(mockKernel->kernelInfo.getGraphicsAllocation());
+        clDevice->getMemoryManager()->freeGraphicsMemory(mockKernel->kernelInfo.getIsaGraphicsAllocation());
    }

    template <typename FamilyType>
@@ -90,7 +90,7 @@ XE_HPC_CORETEST_F(MemoryPrefetchTestsXeHpcCore, givenKernelWhenWalkerIsProgramme
    auto statePrefetchCmd = genCmdCast<STATE_PREFETCH *>(*itorStatePrefetch);
    EXPECT_NE(nullptr, statePrefetchCmd);

-    EXPECT_EQ(mockKernel->kernelInfo.getGraphicsAllocation()->getGpuAddress(), statePrefetchCmd->getAddress());
+    EXPECT_EQ(mockKernel->kernelInfo.getIsaGraphicsAllocation()->getGpuAddress(), statePrefetchCmd->getAddress());
    EXPECT_TRUE(statePrefetchCmd->getKernelInstructionPrefetch());
 }

--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -440,6 +440,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableDeviceUsmAllocationPool, -1, "-1: default
 DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableUsmAllocationPoolManager, -1, "-1: default, 0: disabled, 1: enabled, use growing pools")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableUsmPoolResidencyTracking, -1, "-1: default, 0: disabled, 1: enabled, track residency per chunk")
+DECLARE_DEBUG_VARIABLE(int32_t, EnableIsaAllocationPool, -1, "-1: default, 0: disabled, 1: enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableUsmPoolLazyInit, -1, "-1: default, 0: disabled, 1: enabled, initialize usm pools on first alloc")
 DECLARE_DEBUG_VARIABLE(int32_t, UseLocalPreferredForCacheableBuffers, -1, "Use localPreferred for cacheable buffers")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableCopyWithStagingBuffers, -1, "Enable copy with non-usm memory through staging buffers. -1: default, 0: disabled, 1: enabled")
--- a/shared/source/program/kernel_info.cpp
+++ b/shared/source/program/kernel_info.cpp
@@ -120,6 +120,31 @@ void KernelInfo::apply(const DeviceInfoKernelPayloadConstants &constants) {
    setIfValidOffset(constants.maxWorkGroupSize, implicitArgs.maxWorkGroupSize);
 }

+GraphicsAllocation *KernelInfo::getIsaGraphicsAllocation() const {
+    if (auto allocation = this->getIsaParentAllocation(); allocation != nullptr) {
+        DEBUG_BREAK_IF(this->kernelAllocation != nullptr);
+        return allocation;
+    } else {
+        DEBUG_BREAK_IF(this->kernelAllocation == nullptr);
+        return this->kernelAllocation;
+    }
+}
+
+uint32_t KernelInfo::getIsaSize() const {
+    if (this->getIsaParentAllocation()) {
+        DEBUG_BREAK_IF(this->kernelAllocation != nullptr);
+        return static_cast<uint32_t>(this->isaSubAllocationSize);
+    } else {
+        DEBUG_BREAK_IF(this->kernelAllocation == nullptr);
+        return static_cast<uint32_t>(this->kernelAllocation->getUnderlyingBufferSize());
+    }
+}
+
+void KernelInfo::setIsaPerKernelAllocation(GraphicsAllocation *allocation) {
+    DEBUG_BREAK_IF(this->isaParentAllocation != nullptr);
+    this->kernelAllocation = allocation;
+}
+
 std::string concatenateKernelNames(ArrayRef<KernelInfo *> kernelInfos) {
    std::string semiColonDelimitedKernelNameStr;

--- a/shared/source/program/kernel_info.h
+++ b/shared/source/program/kernel_info.h
@@ -83,6 +83,30 @@ struct KernelInfo : NEO::NonCopyableAndNonMovableClass {
    bool createKernelAllocation(const Device &device, bool internalIsa);
    void apply(const DeviceInfoKernelPayloadConstants &constants);

+    uint32_t getIsaSize() const;
+    GraphicsAllocation *getIsaGraphicsAllocation() const;
+    void setIsaPerKernelAllocation(GraphicsAllocation *allocation);
+
+    inline GraphicsAllocation *getIsaParentAllocation() const {
+        return isaParentAllocation;
+    }
+    inline void setIsaParentAllocation(GraphicsAllocation *allocation) {
+        isaParentAllocation = allocation;
+    }
+    inline size_t getIsaOffsetInParentAllocation() const {
+        DEBUG_BREAK_IF(this->kernelAllocation != nullptr && 0u != isaSubAllocationOffset);
+        return isaSubAllocationOffset;
+    }
+    inline void setIsaSubAllocationOffset(size_t offset) {
+        isaSubAllocationOffset = offset;
+    }
+    inline void setIsaSubAllocationSize(size_t size) {
+        isaSubAllocationSize = size;
+    }
+    inline size_t getIsaSubAllocationSize() const {
+        return isaSubAllocationSize;
+    }
+
    HeapInfo heapInfo = {};
    std::vector<std::pair<uint32_t, uint32_t>> childrenKernelsIdOffset;
    char *crossThreadData = nullptr;
@@ -97,6 +121,11 @@ struct KernelInfo : NEO::NonCopyableAndNonMovableClass {

    uint64_t shaderHashCode;
    KernelDescriptor kernelDescriptor;
+
+  private:
+    GraphicsAllocation *isaParentAllocation = nullptr;
+    size_t isaSubAllocationOffset = 0lu;
+    size_t isaSubAllocationSize = 0lu;
 };

 static_assert(NEO::NonCopyableAndNonMovable<KernelInfo>);
--- a/shared/test/common/test_files/igdrcl.config
+++ b/shared/test/common/test_files/igdrcl.config
@@ -593,6 +593,7 @@ EnableUserFenceUponUnbind = -1
 EnableWaitOnUserFenceAfterBindAndUnbind = -1
 UseGemCreateExtInAllocateMemoryByKMD = -1
 PrintMmapAndMunMapCalls = -1
+EnableIsaAllocationPool = -1
 UseLocalPreferredForCacheableBuffers = -1
 EnableFtrTile64Optimization = 0
 ForceTlbFlushWithTaskCountAfterCopy = -1