/*
 * Copyright (C) 2017-2018 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "runtime/command_queue/gpgpu_walker.h"
|
|
#include "runtime/command_queue/local_id_gen.h"
|
|
#include "runtime/device_queue/device_queue_hw.h"
|
|
#include "runtime/event/user_event.h"
|
|
#include "runtime/helpers/per_thread_data.h"
|
|
#include "runtime/kernel/kernel.h"
|
|
#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
|
|
#include "unit_tests/fixtures/device_host_queue_fixture.h"
|
|
#include "unit_tests/fixtures/execution_model_fixture.h"
|
|
#include "unit_tests/helpers/debug_manager_state_restore.h"
|
|
#include "unit_tests/helpers/gtest_helpers.h"
|
|
#include "unit_tests/helpers/hw_parse.h"
|
|
#include "unit_tests/mocks/mock_csr.h"
|
|
#include "unit_tests/mocks/mock_device_queue.h"
|
|
#include "unit_tests/mocks/mock_event.h"
|
|
#include "unit_tests/mocks/mock_mdi.h"
|
|
#include "unit_tests/mocks/mock_submissions_aggregator.h"
|
|
|
|
using namespace OCLRT;
|
|
|
|
static const char *binaryFile = "simple_block_kernel";
|
|
static const char *KernelNames[] = {"kernel_reflection", "simple_block_kernel"};
|
|
|
|
typedef ExecutionModelKernelTest ParentKernelEnqueueTest;
|
|
|
|
HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueuedThenDeviceQueueDSHHasCorrectlyFilledInterfaceDescriptorTables) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    if (std::string(pPlatform->getDevice(0)->getDeviceInfo().clVersion).find("OpenCL 2.") != std::string::npos) {
        DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);

        const size_t globalOffsets[3] = {0, 0, 0};
        const size_t workItems[3] = {1, 1, 1};

        pKernel->createReflectionSurface();

        BlockKernelManager *blockManager = pProgram->getBlockKernelManager();
        uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());

        auto *executionModelDshAllocation = pDevQueueHw->getDshBuffer();
        void *executionModelDsh = executionModelDshAllocation->getUnderlyingBuffer();

        EXPECT_NE(nullptr, executionModelDsh);

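        // The device queue DSH starts with the color calc state; the interface
        // descriptor data entries begin immediately after it.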
        INTERFACE_DESCRIPTOR_DATA *idData = static_cast<INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(executionModelDsh, DeviceQueue::colorCalcStateSize));

        size_t executionModelDSHUsedBefore = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE)->getUsed();
        uint32_t colorCalcSize = DeviceQueue::colorCalcStateSize;
        EXPECT_EQ(colorCalcSize, executionModelDSHUsedBefore);

        MockMultiDispatchInfo multiDispatchInfo(pKernel);

        auto graphicsAllocation = pKernel->getKernelInfo().getGraphicsAllocation();
        auto kernelIsaAddress = graphicsAllocation->getGpuAddressToPatch();

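        // The kernel ISA address is split into 32-bit halves to match how the
        // interface descriptor stores the kernel start pointer (low/high).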
        uint64_t lowPart = kernelIsaAddress & 0xffffffff;
        uint64_t highPart = (kernelIsaAddress & 0xffffffff00000000) >> 32;

        pCmdQ->enqueueKernel(pKernel, 1, globalOffsets, workItems, workItems, 0, nullptr, nullptr);

        if (pKernel->getKernelInfo().name == "kernel_reflection") {
            EXPECT_NE(0u, idData[0].getSamplerCount());
            EXPECT_NE(0u, idData[0].getSamplerStatePointer());
        }

        EXPECT_NE(0u, idData[0].getConstantIndirectUrbEntryReadLength());
        EXPECT_NE(0u, idData[0].getCrossThreadConstantDataReadLength());
        EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL, idData[0].getDenormMode());
        EXPECT_EQ((uint32_t)lowPart, idData[0].getKernelStartPointer());
        EXPECT_EQ((uint32_t)highPart, idData[0].getKernelStartPointerHigh());

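        // Descriptor 0 was checked above for the parent kernel; descriptors for the
        // block kernels follow it, starting at index 1.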
        const uint32_t blockFirstIndex = 1;

        for (uint32_t i = 0; i < blockCount; i++) {
            const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);

            ASSERT_NE(nullptr, pBlockInfo);
            ASSERT_NE(nullptr, pBlockInfo->patchInfo.dataParameterStream);
            ASSERT_NE(nullptr, pBlockInfo->patchInfo.executionEnvironment);
            ASSERT_NE(nullptr, pBlockInfo->patchInfo.threadPayload);

            const uint32_t sizeCrossThreadData = pBlockInfo->patchInfo.dataParameterStream->DataParameterStreamSize / sizeof(GRF);

            auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*pBlockInfo->patchInfo.threadPayload);
            auto sizePerThreadData = getPerThreadSizeLocalIDs(pBlockInfo->patchInfo.executionEnvironment->LargestCompiledSIMDSize, numChannels);
            uint32_t numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(GRF));
            numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);

            EXPECT_EQ(numGrfPerThreadData, idData[blockFirstIndex + i].getConstantIndirectUrbEntryReadLength());
            EXPECT_EQ(sizeCrossThreadData, idData[blockFirstIndex + i].getCrossThreadConstantDataReadLength());
            EXPECT_NE((uint64_t)0u, ((uint64_t)idData[blockFirstIndex + i].getKernelStartPointerHigh() << 32) | (uint64_t)idData[blockFirstIndex + i].getKernelStartPointer());

            uint64_t kernelAddress = ((uint64_t)idData[blockFirstIndex + i].getKernelStartPointerHigh() << 32) | (uint64_t)idData[blockFirstIndex + i].getKernelStartPointer();
            EXPECT_EQ(pBlockInfo->getGraphicsAllocation()->getGpuAddressToPatch(), kernelAddress);
        }
    }
}

HWTEST_P(ParentKernelEnqueueTest, GivenParentKernelWithPrivateSurfaceWhenEnqueueKernelCalledThenResidencyCountIncreased) {
    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};
        int32_t executionStamp = 0;
        auto mockCSR = new MockCsr<FamilyType>(executionStamp, *pDevice->executionEnvironment);
        pDevice->resetCommandStreamReceiver(mockCSR);
        GraphicsAllocation *privateSurface = mockCSR->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{MemoryConstants::pageSize});

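        // Register the private surface for block 0; enqueueing the parent kernel is
        // expected to make it resident on the mock CSR's OS context.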
        pKernel->getProgram()->getBlockKernelManager()->pushPrivateSurface(privateSurface, 0);
        pCmdQ->enqueueKernel(pKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

        EXPECT_TRUE(privateSurface->isResident(mockCSR->getOsContext().getContextId()));
    }
}

HWTEST_P(ParentKernelEnqueueTest, GivenBlocksWithPrivateMemoryWhenEnqueueKernelThatIsBlockedByUserEventIsCalledThenPrivateAllocationIsMadeResidentWhenEventUnblocks) {
    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};

        auto blockKernelManager = pKernel->getProgram()->getBlockKernelManager();
        auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
        csr.storeMakeResidentAllocations = true;

        auto privateAllocation = csr.getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{MemoryConstants::pageSize});
        blockKernelManager->pushPrivateSurface(privateAllocation, 0);

        UserEvent uEvent(pContext);
        auto clEvent = static_cast<cl_event>(&uEvent);

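        // The user event blocks submission, so the private allocation must not be
        // resident until the event completes and the blocked enqueue is flushed.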
        pCmdQ->enqueueKernel(pKernel, 1, offset, gws, gws, 1, &clEvent, nullptr);

        EXPECT_FALSE(csr.isMadeResident(privateAllocation));
        uEvent.setStatus(CL_COMPLETE);
        EXPECT_TRUE(csr.isMadeResident(privateAllocation));
    }
}

HWTEST_P(ParentKernelEnqueueTest, GivenParentKernelWithBlocksWhenEnqueueKernelIsCalledThenBlockKernelIsaAllocationIsMadeResident) {
    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};

        auto blockKernelManager = pKernel->getProgram()->getBlockKernelManager();
        auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
        csr.storeMakeResidentAllocations = true;

        pCmdQ->enqueueKernel(pKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

        auto blockCount = blockKernelManager->getCount();
        for (auto blockId = 0u; blockId < blockCount; blockId++) {
            EXPECT_TRUE(csr.isMadeResident(blockKernelManager->getBlockKernelInfo(blockId)->getGraphicsAllocation()));
        }
    }
}

HWTEST_P(ParentKernelEnqueueTest, GivenBlockKernelManagerFilledWithBlocksWhenMakeInternalAllocationsResidentIsCalledThenAllSurfacesAreMadeResident) {
    if (pDevice->getSupportedClVersion() >= 20) {
        auto blockKernelManager = pKernel->getProgram()->getBlockKernelManager();
        auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
        csr.storeMakeResidentAllocations = true;

        blockKernelManager->makeInternalAllocationsResident(csr);

        auto blockCount = blockKernelManager->getCount();
        for (auto blockId = 0u; blockId < blockCount; blockId++) {
            EXPECT_TRUE(csr.isMadeResident(blockKernelManager->getBlockKernelInfo(blockId)->getGraphicsAllocation()));
        }
    }
}

HWTEST_P(ParentKernelEnqueueTest, GivenParentKernelWithBlocksWhenEnqueueKernelThatIsBlockedByUserEventIsCalledThenBlockKernelIsaAllocationIsMadeResidentWhenEventUnblocks) {
    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};

        auto blockKernelManager = pKernel->getProgram()->getBlockKernelManager();
        auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
        csr.storeMakeResidentAllocations = true;

        UserEvent uEvent(pContext);
        auto clEvent = static_cast<cl_event>(&uEvent);

        pCmdQ->enqueueKernel(pKernel, 1, offset, gws, gws, 1, &clEvent, nullptr);

        auto blockCount = blockKernelManager->getCount();
        for (auto blockId = 0u; blockId < blockCount; blockId++) {
            EXPECT_FALSE(csr.isMadeResident(blockKernelManager->getBlockKernelInfo(blockId)->getGraphicsAllocation()));
        }

        uEvent.setStatus(CL_COMPLETE);

        for (auto blockId = 0u; blockId < blockCount; blockId++) {
            EXPECT_TRUE(csr.isMadeResident(blockKernelManager->getBlockKernelInfo(blockId)->getGraphicsAllocation()));
        }
    }
}

HWTEST_P(ParentKernelEnqueueTest, givenParentKernelWhenEnqueuedSecondTimeThenDeviceQueueDSHIsResetToInitialOffset) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    if (std::string(pPlatform->getDevice(0)->getDeviceInfo().clVersion).find("OpenCL 2.") != std::string::npos) {
        DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);

        const size_t globalOffsets[3] = {0, 0, 0};
        const size_t workItems[3] = {1, 1, 1};

        auto dsh = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        size_t executionModelDSHUsedBefore = dsh->getUsed();

        uint32_t colorCalcSize = DeviceQueue::colorCalcStateSize;
        EXPECT_EQ(colorCalcSize, executionModelDSHUsedBefore);

        MockMultiDispatchInfo multiDispatchInfo(pKernel);

        pCmdQ->enqueueKernel(pKernel, 1, globalOffsets, workItems, workItems, 0, nullptr, nullptr);

        size_t executionModelDSHUsedAfterFirst = dsh->getUsed();
        EXPECT_LT(executionModelDSHUsedBefore, executionModelDSHUsedAfterFirst);

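        // Resetting the device queue rewinds its DSH, so the second enqueue should
        // leave the heap at exactly the same used offset as the first one.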
        pDevQueueHw->resetDeviceQueue();

        pCmdQ->enqueueKernel(pKernel, 1, globalOffsets, workItems, workItems, 0, nullptr, nullptr);

        size_t executionModelDSHUsedAfterSecond = dsh->getUsed();
        EXPECT_EQ(executionModelDSHUsedAfterFirst, executionModelDSHUsedAfterSecond);
    }
}

HWTEST_P(ParentKernelEnqueueTest, givenParentKernelWhenEnqueuedThenBlocksSurfaceStatesAreCopied) {
    using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    if (std::string(pPlatform->getDevice(0)->getDeviceInfo().clVersion).find("OpenCL 2.") != std::string::npos) {

        const size_t globalOffsets[3] = {0, 0, 0};
        const size_t workItems[3] = {1, 1, 1};

        pKernel->createReflectionSurface();

        BlockKernelManager *blockManager = pProgram->getBlockKernelManager();
        uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());

        size_t parentKernelSSHSize = pKernel->getSurfaceStateHeapSize();

        MockMultiDispatchInfo multiDispatchInfo(pKernel);

        auto ssh = &getIndirectHeap<FamilyType, IndirectHeap::SURFACE_STATE>(*pCmdQ, multiDispatchInfo);
        // Pre-align the SSH so that it won't need to be realigned in enqueueKernel;
        // this way we can assume the location in memory into which the surface states
        // will be copied.
        ssh->align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

        // mark the assumed place for surface states
        size_t parentSshOffset = ssh->getUsed();

        pCmdQ->enqueueKernel(pKernel, 1, globalOffsets, workItems, workItems, 0, nullptr, nullptr);

        void *blockSSH = ptrOffset(ssh->getCpuBase(), parentSshOffset + parentKernelSSHSize); // note: unaligned at this point

        for (uint32_t i = 0; i < blockCount; i++) {
            const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);

            ASSERT_NE(nullptr, pBlockInfo);
            ASSERT_NE(nullptr, pBlockInfo->patchInfo.dataParameterStream);
            ASSERT_NE(nullptr, pBlockInfo->patchInfo.executionEnvironment);
            ASSERT_NE(nullptr, pBlockInfo->patchInfo.threadPayload);

            Kernel *blockKernel = Kernel::create(pKernel->getProgram(), *pBlockInfo, nullptr);
            blockSSH = alignUp(blockSSH, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
            if (blockKernel->getNumberOfBindingTableStates() > 0) {
                ASSERT_NE(nullptr, pBlockInfo->patchInfo.bindingTableState);
                auto dstBlockBti = ptrOffset(blockSSH, pBlockInfo->patchInfo.bindingTableState->Offset);
                EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(dstBlockBti) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE);
                auto dstBindingTable = reinterpret_cast<BINDING_TABLE_STATE *>(dstBlockBti);

                auto srcBlockBti = ptrOffset(pBlockInfo->heapInfo.pSsh, pBlockInfo->patchInfo.bindingTableState->Offset);
                auto srcBindingTable = reinterpret_cast<BINDING_TABLE_STATE *>(srcBlockBti);
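                // Every binding table entry in the queue's SSH must point at a surface
                // state that is a byte-for-byte copy of the block's source surface state.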
                for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(); ++i) {
                    uint32_t dstSurfaceStatePointer = dstBindingTable[i].getSurfaceStatePointer();
                    uint32_t srcSurfaceStatePointer = srcBindingTable[i].getSurfaceStatePointer();
                    auto *dstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh->getCpuBase(), dstSurfaceStatePointer));
                    auto *srcSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(pBlockInfo->heapInfo.pSsh, srcSurfaceStatePointer));
                    EXPECT_EQ(0, memcmp(srcSurfaceState, dstSurfaceState, sizeof(RENDER_SURFACE_STATE)));
                }

                blockSSH = ptrOffset(blockSSH, blockKernel->getSurfaceStateHeapSize());
            }

            delete blockKernel;
        }
    }
}

HWTEST_P(ParentKernelEnqueueTest, givenParentKernelWhenEnqueuedThenReflectionSurfaceIsCreated) {
    using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    if (std::string(pPlatform->getDevice(0)->getDeviceInfo().clVersion).find("OpenCL 2.") != std::string::npos) {

        const size_t globalOffsets[3] = {0, 0, 0};
        const size_t workItems[3] = {1, 1, 1};

        MockMultiDispatchInfo multiDispatchInfo(pKernel);
        pCmdQ->enqueueKernel(pKernel, 1, globalOffsets, workItems, workItems, 0, nullptr, nullptr);

        EXPECT_NE(nullptr, pKernel->getKernelReflectionSurface());
    }
}

HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenBlockedQueueWhenParentKernelIsEnqueuedThenDeviceQueueIsNotReset) {
    if (std::string(pPlatform->getDevice(0)->getDeviceInfo().clVersion).find("OpenCL 2.") != std::string::npos) {

        const size_t globalOffsets[3] = {0, 0, 0};
        const size_t workItems[3] = {1, 1, 1};
        cl_queue_properties properties[3] = {0};

        MockMultiDispatchInfo multiDispatchInfo(pKernel);
        MockDeviceQueueHw<FamilyType> mockDevQueue(context, pDevice, properties[0]);

        context->setDefaultDeviceQueue(&mockDevQueue);
        // Acquire the EM critical section up front to detect whether the blocked
        // enqueue resets the device queue.
        mockDevQueue.acquireEMCriticalSection();

        MockEvent<UserEvent> mockEvent(context);

        cl_event eventBlocking = &mockEvent;

        pCmdQ->enqueueKernel(pKernel, 1, globalOffsets, workItems, workItems, 1, &eventBlocking, nullptr);

        EXPECT_FALSE(mockDevQueue.isEMCriticalSectionFree());
    }
}

HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenNonBlockedQueueWhenParentKernelIsEnqueuedThenDeviceQueueDSHAddressIsProgrammedInStateBaseAddressAndDSHIsMadeResident) {
    typedef typename FamilyType::STATE_BASE_ADDRESS STATE_BASE_ADDRESS;

    if (std::string(pPlatform->getDevice(0)->getDeviceInfo().clVersion).find("OpenCL 2.") != std::string::npos) {
        DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
        ASSERT_NE(nullptr, pDevQueueHw);

        const size_t globalOffsets[3] = {0, 0, 0};
        const size_t workItems[3] = {1, 1, 1};

        MockMultiDispatchInfo multiDispatchInfo(pKernel);

        int32_t executionStamp = 0;
        auto mockCSR = new MockCsrBase<FamilyType>(executionStamp, *pDevice->executionEnvironment);
        pDevice->resetCommandStreamReceiver(mockCSR);

        pCmdQ->enqueueKernel(pKernel, 1, globalOffsets, workItems, workItems, 0, nullptr, nullptr);

        auto &cmdStream = mockCSR->getCS(0);

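        // Parse the generated command stream and locate STATE_BASE_ADDRESS to verify
        // which dynamic state base address was programmed.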
        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(cmdStream, 0);
        hwParser.findHardwareCommands<FamilyType>();

        auto stateBaseAddressItor = hwParser.itorStateBaseAddress;

        ASSERT_NE(hwParser.cmdList.end(), stateBaseAddressItor);

        auto *stateBaseAddress = (STATE_BASE_ADDRESS *)*stateBaseAddressItor;

        uint64_t addressProgrammed = stateBaseAddress->getDynamicStateBaseAddress();

        EXPECT_EQ(addressProgrammed, pDevQueue->getDshBuffer()->getGpuAddress());

        bool dshAllocationResident = false;

        for (uint32_t i = 0; i < mockCSR->madeResidentGfxAllocations.size(); i++) {
            if (mockCSR->madeResidentGfxAllocations[i] == pDevQueue->getDshBuffer()) {
                dshAllocationResident = true;
                break;
            }
        }
        EXPECT_TRUE(dshAllocationResident);
    }
}

INSTANTIATE_TEST_CASE_P(ParentKernelEnqueueTest,
                        ParentKernelEnqueueTest,
                        ::testing::Combine(
                            ::testing::Values(binaryFile),
                            ::testing::ValuesIn(KernelNames)));

class ParentKernelEnqueueFixture : public ExecutionModelSchedulerTest,
                                   public testing::Test {
    void SetUp() override {
        ExecutionModelSchedulerTest::SetUp();
    }

    void TearDown() override {
        ExecutionModelSchedulerTest::TearDown();
    }
};

TEST_F(ParentKernelEnqueueFixture, GivenParentKernelWhenEnqueuedThenDefaultDeviceQueueAndEventPoolArePatched) {

    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};

        pCmdQ->enqueueKernel(parentKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

        const auto &patchInfo = parentKernel->getKernelInfo().patchInfo;

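        // When the corresponding patch tokens are present, the cross-thread data must
        // hold the GPU addresses of the default device queue and event pool buffers.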
        if (patchInfo.pAllocateStatelessDefaultDeviceQueueSurface) {
            auto patchLocation = ptrOffset(reinterpret_cast<uint64_t *>(parentKernel->getCrossThreadData()),
                                           patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->DataParamOffset);

            EXPECT_EQ(pDevQueue->getQueueBuffer()->getGpuAddress(), *patchLocation);
        }

        if (patchInfo.pAllocateStatelessEventPoolSurface) {
            auto patchLocation = ptrOffset(reinterpret_cast<uint64_t *>(parentKernel->getCrossThreadData()),
                                           patchInfo.pAllocateStatelessEventPoolSurface->DataParamOffset);

            EXPECT_EQ(pDevQueue->getEventPoolBuffer()->getGpuAddress(), *patchLocation);
        }
    }
}

HWTEST_F(ParentKernelEnqueueFixture, GivenParentKernelWhenEnqueuedThenBlocksDSHOnReflectionSurfaceArePatchedWithDeviceQueueAndEventPoolAddresses) {

    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};
        DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);

        pCmdQ->enqueueKernel(parentKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

        void *reflectionSurface = parentKernel->getKernelReflectionSurface()->getUnderlyingBuffer();

        BlockKernelManager *blockManager = parentKernel->getProgram()->getBlockKernelManager();
        uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());

        for (uint32_t i = 0; i < blockCount; i++) {
            const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);

            uint32_t defaultQueueOffset = pBlockInfo->patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->DataParamOffset;
            uint32_t eventPoolOffset = pBlockInfo->patchInfo.pAllocateStatelessEventPoolSurface->DataParamOffset;

            uint32_t defaultQueueSize = pBlockInfo->patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->DataParamSize;
            uint32_t eventPoolSize = pBlockInfo->patchInfo.pAllocateStatelessEventPoolSurface->DataParamSize;

            uint32_t offset = MockKernel::ReflectionSurfaceHelperPublic::getConstantBufferOffset(reflectionSurface, i);

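            // The patch token size tells whether the block expects a 64-bit or a 32-bit
            // pointer, so the patched value is compared with the matching width.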
            if (defaultQueueSize == sizeof(uint64_t)) {
                EXPECT_EQ_VAL(pDevQueueHw->getQueueBuffer()->getGpuAddressToPatch(), *(uint64_t *)ptrOffset(reflectionSurface, offset + defaultQueueOffset));
            } else {
                EXPECT_EQ((uint32_t)pDevQueueHw->getQueueBuffer()->getGpuAddressToPatch(), *(uint32_t *)ptrOffset(reflectionSurface, offset + defaultQueueOffset));
            }

            if (eventPoolSize == sizeof(uint64_t)) {
                EXPECT_EQ_VAL(pDevQueueHw->getEventPoolBuffer()->getGpuAddressToPatch(), *(uint64_t *)ptrOffset(reflectionSurface, offset + eventPoolOffset));
            } else {
                EXPECT_EQ((uint32_t)pDevQueueHw->getEventPoolBuffer()->getGpuAddressToPatch(), *(uint32_t *)ptrOffset(reflectionSurface, offset + eventPoolOffset));
            }
        }
    }
}

HWTEST_F(ParentKernelEnqueueFixture, GivenParentKernelWhenEnqueuedToNonBlockedQueueThenDeviceQueueCriticalSectionIsAcquired) {

    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};
        DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);

        EXPECT_TRUE(pDevQueueHw->isEMCriticalSectionFree());

        pCmdQ->enqueueKernel(parentKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

        EXPECT_FALSE(pDevQueueHw->isEMCriticalSectionFree());
    }
}

HWTEST_F(ParentKernelEnqueueFixture, GivenParentKernelWhenEnqueuedToBlockedQueueThenDeviceQueueCriticalSectionIsNotAcquired) {

    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};
        DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);

        MockEvent<UserEvent> mockEvent(context);
        cl_event eventBlocking = &mockEvent;

        EXPECT_TRUE(pDevQueueHw->isEMCriticalSectionFree());

        pCmdQ->enqueueKernel(parentKernel, 1, offset, gws, gws, 1, &eventBlocking, nullptr);

        EXPECT_TRUE(pDevQueueHw->isEMCriticalSectionFree());
    }
}

HWTEST_F(ParentKernelEnqueueFixture, ParentKernelEnqueuedToNonBlockedQueueFlushesCSRWithSLM) {

    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};
        int32_t execStamp = 0;
        auto mockCsr = new MockCsr<FamilyType>(execStamp, *pDevice->executionEnvironment);
        pDevice->resetCommandStreamReceiver(mockCsr);

        pCmdQ->enqueueKernel(parentKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

        EXPECT_TRUE(mockCsr->slmUsedInLastFlushTask);
    }
}

HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelEnqueueFixture, ParentKernelEnqueuedWithSchedulerReturnInstanceRunsSimulation) {

    if (pDevice->getSupportedClVersion() >= 20) {

        DebugManagerStateRestore dbgRestorer;
        DebugManager.flags.SchedulerSimulationReturnInstance.set(1);

        MockDeviceQueueHw<FamilyType> *mockDeviceQueueHw = new MockDeviceQueueHw<FamilyType>(context, pDevice, DeviceHostQueue::deviceQueueProperties::minimumProperties[0]);
        mockDeviceQueueHw->resetDeviceQueue();

        context->setDefaultDeviceQueue(mockDeviceQueueHw);

        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};
        int32_t execStamp = 0;
        auto mockCsr = new MockCsr<FamilyType>(execStamp, *pDevice->executionEnvironment);

        BuiltinKernelsSimulation::SchedulerSimulation<FamilyType>::enabled = false;

        pDevice->resetCommandStreamReceiver(mockCsr);

        pCmdQ->enqueueKernel(parentKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

        BuiltinKernelsSimulation::SchedulerSimulation<FamilyType>::enabled = true;

        EXPECT_TRUE(BuiltinKernelsSimulation::SchedulerSimulation<FamilyType>::simulationRun);
        delete mockDeviceQueueHw;
    }
}

HWTEST_F(ParentKernelEnqueueFixture, givenCsrInBatchingModeWhenExecutionModelKernelIsSubmittedThenItIsFlushed) {
    if (pDevice->getSupportedClVersion() >= 20) {
        auto mockCsr = new MockCsrHw2<FamilyType>(pDevice->getHardwareInfo(), *pDevice->executionEnvironment);
        mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
        pDevice->resetCommandStreamReceiver(mockCsr);

        auto mockedSubmissionsAggregator = new mockSubmissionsAggregator();
        mockCsr->overrideSubmissionAggregator(mockedSubmissionsAggregator);

        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};

        MockContext context(pDevice);
        std::unique_ptr<MockParentKernel> kernelToRun(MockParentKernel::create(context, false, false, false, false, false));

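        // In batching mode command buffers normally wait in the submissions aggregator;
        // an execution model kernel is expected to be flushed immediately instead.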
        pCmdQ->enqueueKernel(kernelToRun.get(), 1, offset, gws, gws, 0, nullptr, nullptr);

        EXPECT_TRUE(mockedSubmissionsAggregator->peekCmdBufferList().peekIsEmpty());
        EXPECT_EQ(1, mockCsr->flushCalledCount);
    }
}

HWTEST_F(ParentKernelEnqueueFixture, ParentKernelEnqueueMarksCSRMediaVFEStateDirty) {

    if (pDevice->getSupportedClVersion() >= 20) {
        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};
        int32_t execStamp = 0;
        auto mockCsr = new MockCsr<FamilyType>(execStamp, *pDevice->executionEnvironment);
        pDevice->resetCommandStreamReceiver(mockCsr);

        mockCsr->setMediaVFEStateDirty(false);
        pCmdQ->enqueueKernel(parentKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

        EXPECT_TRUE(mockCsr->peekMediaVfeStateDirty());
    }
}