Assign gpgpu engine at first enqueue

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
2025-09-10 12:53:42 +08:00 · 2022-04-29 08:02:40 +00:00
parent a6490062a9
commit 73d3d83e60
14 changed files with 149 additions and 78 deletions
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@ -73,14 +73,10 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
        auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
        auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);

-        gpgpuEngine = &device->getDefaultEngine();
-
-        UNRECOVERABLE_IF(gpgpuEngine->getEngineType() >= aub_stream::EngineType::NUM_ENGINES);
-
        bool bcsAllowed = hwInfoConfig->isBlitterFullySupported(hwInfo) &&
                          hwHelper.isSubDeviceEngineSupported(hwInfo, device->getDeviceBitfield(), aub_stream::EngineType::ENGINE_BCS);

-        if (bcsAllowed || gpgpuEngine->commandStreamReceiver->peekTimestampPacketWriteEnabled()) {
+        if (bcsAllowed || device->getDefaultEngine().commandStreamReceiver->peekTimestampPacketWriteEnabled()) {
            timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
            deferredTimestampPackets = std::make_unique<TimestampPacketContainer>();
        }
@ -104,9 +100,8 @@ CommandQueue::~CommandQueue() {
    }

    if (device) {
-        auto storageForAllocation = gpgpuEngine->commandStreamReceiver->getInternalAllocationStorage();
-
        if (commandStream) {
+            auto storageForAllocation = gpgpuEngine->commandStreamReceiver->getInternalAllocationStorage();
            storageForAllocation->storeAllocation(std::unique_ptr<GraphicsAllocation>(commandStream->getGraphicsAllocation()), REUSABLE_ALLOCATION);
        }
        delete commandStream;
@ -130,7 +125,59 @@ CommandQueue::~CommandQueue() {
    gtpinRemoveCommandQueue(this);
 }

+void CommandQueue::initializeGpgpu() const {
+    if (gpgpuEngine == nullptr) {
+        auto &hwInfo = device->getDevice().getHardwareInfo();
+        auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+
+        auto assignEngineRoundRobin =
+            !this->isSpecialCommandQueue &&
+            !this->queueFamilySelected &&
+            !(getCmdQueueProperties<cl_queue_priority_khr>(propertiesVector.data(), CL_QUEUE_PRIORITY_KHR) & static_cast<cl_queue_priority_khr>(CL_QUEUE_PRIORITY_LOW_KHR)) &&
+            hwHelper.isAssignEngineRoundRobinSupported() &&
+            this->isAssignEngineRoundRobinEnabled();
+
+        if (assignEngineRoundRobin) {
+            this->gpgpuEngine = &device->getDevice().getNextEngineForCommandQueue();
+        } else {
+            this->gpgpuEngine = &device->getDefaultEngine();
+        }
+
+        this->initializeGpgpuInternals();
+    }
+}
+
+void CommandQueue::initializeGpgpuInternals() const {
+    auto &hwInfo = device->getDevice().getHardwareInfo();
+    auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+
+    if (getCmdQueueProperties<cl_queue_properties>(propertiesVector.data(), CL_QUEUE_PROPERTIES) & static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)) {
+        this->gpgpuEngine->commandStreamReceiver->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
+        if (DebugManager.flags.CsrDispatchMode.get() != 0) {
+            this->gpgpuEngine->commandStreamReceiver->overrideDispatchPolicy(static_cast<DispatchMode>(DebugManager.flags.CsrDispatchMode.get()));
+        }
+        this->gpgpuEngine->commandStreamReceiver->enableNTo1SubmissionModel();
+    }
+
+    if (device->getDevice().getDebugger() && !this->gpgpuEngine->commandStreamReceiver->getDebugSurfaceAllocation()) {
+        auto maxDbgSurfaceSize = hwHelper.getSipKernelMaxDbgSurfaceSize(hwInfo);
+        auto debugSurface = this->gpgpuEngine->commandStreamReceiver->allocateDebugSurface(maxDbgSurfaceSize);
+        memset(debugSurface->getUnderlyingBuffer(), 0, debugSurface->getUnderlyingBufferSize());
+
+        auto &stateSaveAreaHeader = SipKernel::getSipKernel(device->getDevice()).getStateSaveAreaHeader();
+        if (stateSaveAreaHeader.size() > 0) {
+            NEO::MemoryTransferHelper::transferMemoryToAllocation(hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo, *debugSurface),
+                                                                  device->getDevice(), debugSurface, 0, stateSaveAreaHeader.data(),
+                                                                  stateSaveAreaHeader.size());
+        }
+    }
+
+    gpgpuEngine->osContext->ensureContextInitialized();
+    gpgpuEngine->commandStreamReceiver->initDirectSubmission();
+}
+
 CommandStreamReceiver &CommandQueue::getGpgpuCommandStreamReceiver() const {
+    this->initializeGpgpu();
    return *gpgpuEngine->commandStreamReceiver;
 }

@ -700,7 +747,7 @@ cl_uint CommandQueue::getQueueFamilyIndex() const {
    } else {
        const auto &hwInfo = device->getHardwareInfo();
        const auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
-        const auto engineGroupType = hwHelper.getEngineGroupType(gpgpuEngine->getEngineType(), gpgpuEngine->getEngineUsage(), hwInfo);
+        const auto engineGroupType = hwHelper.getEngineGroupType(getGpgpuEngine().getEngineType(), getGpgpuEngine().getEngineUsage(), hwInfo);
        const auto familyIndex = device->getDevice().getEngineGroupIndexFromEngineGroupType(engineGroupType);
        return static_cast<cl_uint>(familyIndex);
    }
--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@ -222,6 +222,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
                                             cl_uint numEventsInWaitList,
                                             const cl_event *eventWaitList);

+    void initializeGpgpu() const;
+    void initializeGpgpuInternals() const;
    MOCKABLE_VIRTUAL CommandStreamReceiver &getGpgpuCommandStreamReceiver() const;
    MOCKABLE_VIRTUAL CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const;
    CommandStreamReceiver *getBcsForAuxTranslation() const;
@ -230,7 +232,10 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
    ClDevice &getClDevice() const { return *device; }
    Context &getContext() const { return *context; }
    Context *getContextPtr() const { return context; }
-    EngineControl &getGpgpuEngine() const { return *gpgpuEngine; }
+    EngineControl &getGpgpuEngine() const {
+        this->initializeGpgpu();
+        return *gpgpuEngine;
+    }

    MOCKABLE_VIRTUAL LinearStream &getCS(size_t minRequiredSize);
    IndirectHeap &getIndirectHeap(IndirectHeap::Type heapType,
@ -387,7 +392,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {

    Context *context = nullptr;
    ClDevice *device = nullptr;
-    EngineControl *gpgpuEngine = nullptr;
+    mutable EngineControl *gpgpuEngine = nullptr;
    std::array<EngineControl *, bcsInfoMaskSize> bcsEngines = {};
    std::vector<aub_stream::EngineType> bcsEngineTypes = {};

--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@ -62,39 +62,8 @@ class CommandQueueHw : public CommandQueue {
            this->gpgpuEngine = &device->getInternalEngine();
        }

-        auto &hwInfo = device->getDevice().getHardwareInfo();
-        auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
-
-        auto assignEngineRoundRobin =
-            !internalUsage &&
-            !this->queueFamilySelected &&
-            !(clPriority & static_cast<cl_queue_priority_khr>(CL_QUEUE_PRIORITY_LOW_KHR)) &&
-            hwHelper.isAssignEngineRoundRobinSupported() &&
-            this->isAssignEngineRoundRobinEnabled();
-
-        if (assignEngineRoundRobin) {
-            this->gpgpuEngine = &device->getDevice().getNextEngineForCommandQueue();
-        }
-
-        if (getCmdQueueProperties<cl_queue_properties>(properties, CL_QUEUE_PROPERTIES) & static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)) {
-            getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);
-            if (DebugManager.flags.CsrDispatchMode.get() != 0) {
-                getGpgpuCommandStreamReceiver().overrideDispatchPolicy(static_cast<DispatchMode>(DebugManager.flags.CsrDispatchMode.get()));
-            }
-            getGpgpuCommandStreamReceiver().enableNTo1SubmissionModel();
-        }
-
-        if (device->getDevice().getDebugger() && !getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation()) {
-            auto maxDbgSurfaceSize = hwHelper.getSipKernelMaxDbgSurfaceSize(hwInfo);
-            auto debugSurface = getGpgpuCommandStreamReceiver().allocateDebugSurface(maxDbgSurfaceSize);
-            memset(debugSurface->getUnderlyingBuffer(), 0, debugSurface->getUnderlyingBufferSize());
-
-            auto &stateSaveAreaHeader = SipKernel::getSipKernel(device->getDevice()).getStateSaveAreaHeader();
-            if (stateSaveAreaHeader.size() > 0) {
-                NEO::MemoryTransferHelper::transferMemoryToAllocation(hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo, *debugSurface),
-                                                                      device->getDevice(), debugSurface, 0, stateSaveAreaHeader.data(),
-                                                                      stateSaveAreaHeader.size());
-            }
+        if (gpgpuEngine) {
+            this->initializeGpgpuInternals();
        }

        uint64_t requestedSliceCount = getCmdQueueProperties<cl_command_queue_properties>(properties, CL_QUEUE_SLICE_COUNT_INTEL);
@ -102,8 +71,16 @@ class CommandQueueHw : public CommandQueue {
            sliceCount = requestedSliceCount;
        }

-        gpgpuEngine->osContext->ensureContextInitialized();
-        gpgpuEngine->commandStreamReceiver->initDirectSubmission();
+        auto initializeGpgpu = false;
+
+        if (DebugManager.flags.DeferCmdQGpgpuInitialization.get() != -1) {
+            initializeGpgpu = !DebugManager.flags.DeferCmdQGpgpuInitialization.get();
+        }
+
+        if (initializeGpgpu) {
+            this->initializeGpgpu();
+        }
+
        for (const EngineControl *engine : bcsEngines) {
            if (engine != nullptr) {
                engine->osContext->ensureContextInitialized();
--- a/opencl/test/unit_test/api/cl_create_command_queue_tests.inl
+++ b/opencl/test/unit_test/api/cl_create_command_queue_tests.inl
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -71,6 +71,7 @@ TEST_F(clCreateCommandQueueTest, GivenOoqParametersWhenQueueIsCreatedThenQueueIs
 }

 HWTEST_F(clCreateCommandQueueTest, GivenOoqParametersWhenQueueIsCreatedThenCommandStreamReceiverSwitchesToBatchingMode) {
+    using BaseType = typename CommandQueue::BaseType;
    cl_int retVal = CL_SUCCESS;
    cl_queue_properties ooq = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
    auto clDevice = castToObject<ClDevice>(testedClDevice);
@ -79,7 +80,8 @@ HWTEST_F(clCreateCommandQueueTest, GivenOoqParametersWhenQueueIsCreatedThenComma
    EXPECT_EQ(DispatchMode::ImmediateDispatch, csr.dispatchMode);

    auto cmdq = clCreateCommandQueue(pContext, testedClDevice, ooq, &retVal);
-    EXPECT_EQ(DispatchMode::BatchedDispatch, csr.dispatchMode);
+    auto queue = castToObject<CommandQueue>(static_cast<BaseType *>(cmdq));
+    EXPECT_EQ(DispatchMode::BatchedDispatch, queue->getGpgpuCommandStreamReceiver().getDispatchMode());
    retVal = clReleaseCommandQueue(cmdq);
 }

@ -100,6 +102,7 @@ HWTEST_F(clCreateCommandQueueTest, GivenForcedDispatchModeAndOoqParametersWhenQu
 }

 HWTEST_F(clCreateCommandQueueTest, GivenOoqParametersWhenQueueIsCreatedThenCommandStreamReceiverSwitchesToNTo1SubmissionModel) {
+    using BaseType = typename CommandQueue::BaseType;
    cl_int retVal = CL_SUCCESS;
    cl_queue_properties ooq = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
    auto clDevice = castToObject<ClDevice>(testedClDevice);
@ -108,7 +111,8 @@ HWTEST_F(clCreateCommandQueueTest, GivenOoqParametersWhenQueueIsCreatedThenComma
    EXPECT_FALSE(csr.isNTo1SubmissionModelEnabled());

    auto cmdq = clCreateCommandQueue(pContext, testedClDevice, ooq, &retVal);
-    EXPECT_TRUE(csr.isNTo1SubmissionModelEnabled());
+    auto queue = castToObject<CommandQueue>(static_cast<BaseType *>(cmdq));
+    EXPECT_TRUE(queue->getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled());
    retVal = clReleaseCommandQueue(cmdq);
 }

--- a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp
+++ b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp
@ -106,7 +106,7 @@ struct BlitEnqueueTests : public ::testing::Test {
        auto mockProgram = mockKernel->mockProgram;
        mockProgram->setAllowNonUniform(true);

-        gpgpuCsr = mockCmdQueue->gpgpuEngine->commandStreamReceiver;
+        gpgpuCsr = &mockCmdQueue->getGpgpuCommandStreamReceiver();
        bcsCsr = mockCmdQueue->bcsEngines[0]->commandStreamReceiver;
    }

--- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp
@ -32,11 +32,11 @@ HWTEST_F(CommandQueueHwTest, WhenConstructingTwoCommandQueuesThenOnlyOneDebugSur

    MockCommandQueueHw<FamilyType> mockCmdQueueHw1(context, device.get(), nullptr);

-    auto dbgSurface = device->getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
+    auto dbgSurface = mockCmdQueueHw1.getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
    EXPECT_NE(dbgSurface, nullptr);

    MockCommandQueueHw<FamilyType> mockCmdQueueHw2(context, device.get(), nullptr);
-    EXPECT_EQ(dbgSurface, device->getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation());
+    EXPECT_EQ(dbgSurface, mockCmdQueueHw1.getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation());
 }

 HWTEST_F(CommandQueueHwTest, givenNoTimestampPacketsWhenWaitForTimestampsThenNoWaitAndTagIsNotUpdated) {
@ -63,7 +63,7 @@ HWTEST_F(CommandQueueHwTest, WhenDebugSurfaceIsAllocatedThenBufferIsZeroed) {

    MockCommandQueueHw<FamilyType> mockCmdQueueHw1(context, device.get(), nullptr);

-    auto dbgSurface = device->getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
+    auto dbgSurface = mockCmdQueueHw1.getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
    EXPECT_NE(dbgSurface, nullptr);
    auto mem = dbgSurface->getUnderlyingBuffer();
    ASSERT_NE(nullptr, mem);
@ -96,7 +96,7 @@ HWTEST_F(CommandQueueHwTest, WhenConstructingCommandQueueDebugOnButIgcDoesNotRet

    MockCommandQueueHw<FamilyType> mockCmdQueueHw1(context, device.get(), nullptr);

-    auto dbgSurface = device->getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
+    auto dbgSurface = mockCmdQueueHw1.getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
    EXPECT_NE(dbgSurface, nullptr);

    auto &stateSaveAreaHeader = SipKernel::getSipKernel(device->getDevice()).getStateSaveAreaHeader();
--- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
@ -1089,7 +1089,7 @@ HWTEST_F(WaitUntilCompletionTests, givenCleanTemporaryAllocationListEqualsFalseW
    cmdStream->waitForTaskCountReturnValue = WaitStatus::Ready;

    std::unique_ptr<MyCmdQueue<FamilyType>> cmdQ(new MyCmdQueue<FamilyType>(context.get(), device.get()));
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ->gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ->getGpgpuCommandStreamReceiver();
    cmdQ->gpgpuEngine->commandStreamReceiver = cmdStream.get();

    constexpr uint32_t taskCount = 0u;
@ -1109,7 +1109,7 @@ HWTEST_F(WaitUntilCompletionTests, givenGpuHangAndCleanTemporaryAllocationListEq
    cmdStream->waitForTaskCountAndCleanTemporaryAllocationListReturnValue = WaitStatus::GpuHang;

    std::unique_ptr<MyCmdQueue<FamilyType>> cmdQ(new MyCmdQueue<FamilyType>(context.get(), device.get()));
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ->gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ->getGpgpuCommandStreamReceiver();
    cmdQ->gpgpuEngine->commandStreamReceiver = cmdStream.get();

    constexpr uint32_t taskCount = 0u;
@ -1128,7 +1128,7 @@ HWTEST_F(WaitUntilCompletionTests, givenEmptyBcsStatesAndSkipWaitEqualsTrueWhenW
    cmdStream->initializeTagAllocation();

    std::unique_ptr<MyCmdQueue<FamilyType>> cmdQ(new MyCmdQueue<FamilyType>(context.get(), device.get()));
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ->gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ->getGpgpuCommandStreamReceiver();
    cmdQ->gpgpuEngine->commandStreamReceiver = cmdStream.get();

    constexpr uint32_t taskCount = 0u;
@ -1147,7 +1147,7 @@ HWTEST_F(WaitUntilCompletionTests, givenGpuHangAndSkipWaitEqualsFalseWhenWaiting
    cmdStream->waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::GpuHang;

    std::unique_ptr<MyCmdQueue<FamilyType>> cmdQ(new MyCmdQueue<FamilyType>(context.get(), device.get()));
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ->gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ->getGpgpuCommandStreamReceiver();
    cmdQ->gpgpuEngine->commandStreamReceiver = cmdStream.get();

    constexpr uint32_t taskCount = 0u;
@ -1174,7 +1174,7 @@ HWTEST_F(WaitUntilCompletionTests, givenGpuHangOnBcsCsrWhenWaitingUntilCompleteT
    bcsCmdStream->waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::GpuHang;

    std::unique_ptr<MyCmdQueue<FamilyType>> cmdQ(new MyCmdQueue<FamilyType>(context.get(), device.get()));
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ->gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ->getGpgpuCommandStreamReceiver();
    cmdQ->gpgpuEngine->commandStreamReceiver = gpgpuCmdStream.get();
    cmdQ->bcsCsrToReturn = bcsCmdStream.get();

@ -1207,7 +1207,7 @@ HWTEST_F(WaitUntilCompletionTests, givenGpuHangOnBcsCsrWhenWaitingUntilCompleteT
    bcsCmdStream->waitForTaskCountAndCleanTemporaryAllocationListReturnValue = WaitStatus::GpuHang;

    std::unique_ptr<MyCmdQueue<FamilyType>> cmdQ(new MyCmdQueue<FamilyType>(context.get(), device.get()));
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ->gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ->getGpgpuCommandStreamReceiver();
    cmdQ->gpgpuEngine->commandStreamReceiver = gpgpuCmdStream.get();
    cmdQ->bcsCsrToReturn = bcsCmdStream.get();

@ -1241,7 +1241,7 @@ HWTEST_F(WaitUntilCompletionTests, givenSuccessOnBcsCsrWhenWaitingUntilCompleteT
    bcsCmdStream->waitForTaskCountAndCleanTemporaryAllocationListReturnValue = WaitStatus::Ready;

    std::unique_ptr<MyCmdQueue<FamilyType>> cmdQ(new MyCmdQueue<FamilyType>(context.get(), device.get()));
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ->gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ->getGpgpuCommandStreamReceiver();
    cmdQ->gpgpuEngine->commandStreamReceiver = gpgpuCmdStream.get();
    cmdQ->bcsCsrToReturn = bcsCmdStream.get();

@ -2803,7 +2803,7 @@ TEST_F(MultiTileFixture, givenDefaultContextWithRootDeviceWhenQueueIsCreatedThen
    auto rootCsr = rootDevice->getDefaultEngine().commandStreamReceiver;

    MockCommandQueue queue(&context, rootDevice, nullptr, false);
-    ASSERT_NE(nullptr, queue.gpgpuEngine);
+    ASSERT_NE(nullptr, &queue.getGpgpuEngine());
    EXPECT_EQ(rootCsr->isMultiOsContextCapable(), queue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable());
    EXPECT_EQ(rootCsr, queue.gpgpuEngine->commandStreamReceiver);
 }
@ -2814,7 +2814,7 @@ TEST_F(MultiTileFixture, givenDefaultContextWithSubdeviceWhenQueueIsCreatedThenQ
    context.contextType = ContextType::CONTEXT_TYPE_DEFAULT;

    MockCommandQueue queue(&context, subdevice, nullptr, false);
-    ASSERT_NE(nullptr, queue.gpgpuEngine);
+    ASSERT_NE(nullptr, &queue.getGpgpuEngine());
    EXPECT_FALSE(queue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable());
 }

@ -2826,7 +2826,7 @@ TEST_F(MultiTileFixture, givenUnrestrictiveContextWithRootDeviceWhenQueueIsCreat
    auto rootCsr = rootDevice->getDefaultEngine().commandStreamReceiver;

    MockCommandQueue queue(&context, rootDevice, nullptr, false);
-    ASSERT_NE(nullptr, queue.gpgpuEngine);
+    ASSERT_NE(nullptr, &queue.getGpgpuEngine());
    EXPECT_EQ(rootCsr->isMultiOsContextCapable(), queue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable());
    EXPECT_EQ(rootCsr, queue.gpgpuEngine->commandStreamReceiver);
 }
@ -2840,7 +2840,7 @@ TEST_F(MultiTileFixture, givenNotDefaultContextWithRootDeviceAndTileIdMaskWhenQu
    auto rootCsr = rootDevice->getDefaultEngine().commandStreamReceiver;

    MockCommandQueue queue(&context, rootClDevice, nullptr, false);
-    ASSERT_NE(nullptr, queue.gpgpuEngine);
+    ASSERT_NE(nullptr, &queue.getGpgpuEngine());
    EXPECT_EQ(rootCsr->isMultiOsContextCapable(), queue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable());
    EXPECT_EQ(rootCsr, queue.gpgpuEngine->commandStreamReceiver);
 }
--- a/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp
@ -144,12 +144,48 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenCooperativeEngineUsageHintAndCcsWhe
        for (size_t i = 0; i < 4; i++) {
            propertiesCooperativeQueue[3] = i;
            auto pCommandQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, pDevice.get(), propertiesCooperativeQueue);
-            EXPECT_EQ(aub_stream::ENGINE_CCS + i, pCommandQueue->gpgpuEngine->osContext->getEngineType());
-            EXPECT_EQ(EngineUsage::Cooperative, pCommandQueue->gpgpuEngine->osContext->getEngineUsage());
+            EXPECT_EQ(aub_stream::ENGINE_CCS + i, pCommandQueue->getGpgpuEngine().osContext->getEngineType());
+            EXPECT_EQ(EngineUsage::Cooperative, pCommandQueue->getGpgpuEngine().osContext->getEngineUsage());
        }
    }
 }

+HWTEST2_F(CommandQueuePvcAndLaterTests, givenDeferCmdQGpgpuInitializationEnabledWhenCreateCommandQueueThenGpgpuIsNullptr, IsAtLeastXeHpcCore) {
+    DebugManagerStateRestore restorer;
+    DebugManager.flags.DeferCmdQGpgpuInitialization.set(1u);
+
+    HardwareInfo hwInfo = *defaultHwInfo;
+    MockDevice *device = MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo, 0);
+    MockClDevice clDevice{device};
+    cl_device_id clDeviceId = static_cast<cl_device_id>(&clDevice);
+    ClDeviceVector clDevices{&clDeviceId, 1u};
+    cl_int retVal{};
+    auto context = std::unique_ptr<Context>{Context::create<Context>(nullptr, clDevices, nullptr, nullptr, retVal)};
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    auto queue = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), &clDevice, nullptr);
+
+    EXPECT_EQ(nullptr, queue->gpgpuEngine);
+}
+
+HWTEST2_F(CommandQueuePvcAndLaterTests, givenDeferCmdQGpgpuInitializationDisabledWhenCreateCommandQueueThenGpgpuIsnotNullptr, IsAtLeastXeHpcCore) {
+    DebugManagerStateRestore restorer;
+    DebugManager.flags.DeferCmdQGpgpuInitialization.set(0u);
+
+    HardwareInfo hwInfo = *defaultHwInfo;
+    MockDevice *device = MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo, 0);
+    MockClDevice clDevice{device};
+    cl_device_id clDeviceId = static_cast<cl_device_id>(&clDevice);
+    ClDeviceVector clDevices{&clDeviceId, 1u};
+    cl_int retVal{};
+    auto context = std::unique_ptr<Context>{Context::create<Context>(nullptr, clDevices, nullptr, nullptr, retVal)};
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    auto queue = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), &clDevice, nullptr);
+
+    EXPECT_NE(nullptr, queue->gpgpuEngine);
+}
+
 struct BcsCsrSelectionCommandQueueTests : ::testing::Test {
    void SetUp() override {
        HardwareInfo hwInfo = *::defaultHwInfo;
--- a/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp
@ -108,7 +108,7 @@ HWTEST_F(EnqueueReadImageTest, givenCommandQueueAndFailingAllocationForHostSurfa
    auto failCsr = std::make_unique<CreateAllocationForHostSurfaceFailCsr<FamilyType>>(*pDevice->getExecutionEnvironment(), pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());

    failCsr->setupContext(*pDevice->getDefaultEngine().osContext);
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ.gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ.getGpgpuCommandStreamReceiver();
    cmdQ.gpgpuEngine->commandStreamReceiver = failCsr.get();

    auto srcImage = Image2dHelper<>::create(context);
@ -132,7 +132,7 @@ HWTEST_F(EnqueueReadImageTest, givenCommandQueueAndFailingAllocationForHostSurfa
    auto failCsr = std::make_unique<CreateAllocationForHostSurfaceFailCsr<FamilyType>>(*pDevice->getExecutionEnvironment(), pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());

    failCsr->setupContext(*pDevice->getDefaultEngine().osContext);
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ.gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ.getGpgpuCommandStreamReceiver();
    cmdQ.gpgpuEngine->commandStreamReceiver = failCsr.get();

    auto srcImage = Image2dHelper<>::create(context);
@ -175,7 +175,7 @@ HWTEST_F(EnqueueReadImageTest, givenCommandQueueAndPtrCopyAllowedForHostSurfaceW
    auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);

    csr->setupContext(*pDevice->getDefaultEngine().osContext);
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ->gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ->getGpgpuCommandStreamReceiver();
    cmdQ->gpgpuEngine->commandStreamReceiver = csr.get();
    csr->initializeTagAllocation();

@ -199,7 +199,7 @@ HWTEST_F(EnqueueReadImageTest, givenGpuHangAndCommandQueueAndPtrCopyAllowedForHo
    cmdQ->waitForAllEnginesReturnValue = WaitStatus::GpuHang;

    csr->setupContext(*pDevice->getDefaultEngine().osContext);
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ->gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ->getGpgpuCommandStreamReceiver();
    cmdQ->gpgpuEngine->commandStreamReceiver = csr.get();
    csr->initializeTagAllocation();

--- a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
@ -2135,7 +2135,7 @@ HWTEST_F(EnqueueSvmTest, GivenDstHostPtrWhenHostPtrAllocationCreationFailsThenRe
    void *pSrcSVM = ptrSVM;
    MockCommandQueueHw<FamilyType> cmdQ(context, pClDevice, nullptr);
    auto failCsr = std::make_unique<FailCsr<FamilyType>>(*pDevice->getExecutionEnvironment(), pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ.gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ.getGpgpuCommandStreamReceiver();
    cmdQ.gpgpuEngine->commandStreamReceiver = failCsr.get();
    retVal = cmdQ.enqueueSVMMemcpy(
        false,   // cl_bool  blocking_copy
@ -2156,7 +2156,7 @@ HWTEST_F(EnqueueSvmTest, GivenSrcHostPtrAndSizeZeroWhenHostPtrAllocationCreation
    void *pSrcSVM = srcHostPtr;
    MockCommandQueueHw<FamilyType> cmdQ(context, pClDevice, nullptr);
    auto failCsr = std::make_unique<FailCsr<FamilyType>>(*pDevice->getExecutionEnvironment(), pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ.gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ.getGpgpuCommandStreamReceiver();
    cmdQ.gpgpuEngine->commandStreamReceiver = failCsr.get();
    retVal = cmdQ.enqueueSVMMemcpy(
        false,   // cl_bool  blocking_copy
@ -2178,7 +2178,7 @@ HWTEST_F(EnqueueSvmTest, givenDstHostPtrAndSrcHostPtrWhenHostPtrAllocationCreati
    void *pSrcSVM = srcHostPtr;
    MockCommandQueueHw<FamilyType> cmdQ(context, pClDevice, nullptr);
    auto failCsr = std::make_unique<FailCsr<FamilyType>>(*pDevice->getExecutionEnvironment(), pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
-    CommandStreamReceiver *oldCommandStreamReceiver = cmdQ.gpgpuEngine->commandStreamReceiver;
+    CommandStreamReceiver *oldCommandStreamReceiver = &cmdQ.getGpgpuCommandStreamReceiver();
    cmdQ.gpgpuEngine->commandStreamReceiver = failCsr.get();
    retVal = cmdQ.enqueueSVMMemcpy(
        false,   // cl_bool  blocking_copy
--- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
@ -500,7 +500,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenWriteBufferEnqueueWithGpgpuSubmissionWhe

    auto cmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(bcsMockContext.get(), device.get(), nullptr));

-    auto queueCsr = cmdQ->gpgpuEngine->commandStreamReceiver;
+    auto queueCsr = &cmdQ->getGpgpuCommandStreamReceiver();
    auto initialTaskCount = queueCsr->peekTaskCount();

    cl_int retVal = CL_SUCCESS;
@ -531,7 +531,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenReadBufferEnqueueWithGpgpuSubmissionWhen

    auto cmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(bcsMockContext.get(), device.get(), nullptr));

-    auto queueCsr = cmdQ->gpgpuEngine->commandStreamReceiver;
+    auto queueCsr = &cmdQ->getGpgpuCommandStreamReceiver();
    auto initialTaskCount = queueCsr->peekTaskCount();

    cl_int retVal = CL_SUCCESS;
@ -627,7 +627,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnq
    auto cmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(bcsMockContext.get(), device.get(), nullptr));
    auto bcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(this->bcsCsr);

-    auto queueCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(cmdQ->gpgpuEngine->commandStreamReceiver);
+    auto queueCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(&cmdQ->getGpgpuCommandStreamReceiver());
    queueCsr->stallingCommandsOnNextFlushRequired = true;

    cl_int retVal = CL_SUCCESS;
@ -726,7 +726,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlocked
    auto cmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(bcsMockContext.get(), device.get(), nullptr));
    auto bcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(this->bcsCsr);

-    auto queueCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(cmdQ->gpgpuEngine->commandStreamReceiver);
+    auto queueCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(&cmdQ->getGpgpuCommandStreamReceiver());
    queueCsr->stallingCommandsOnNextFlushRequired = true;

    cl_int retVal = CL_SUCCESS;
--- a/opencl/test/unit_test/mocks/mock_command_queue.h
+++ b/opencl/test/unit_test/mocks/mock_command_queue.h
@ -282,7 +282,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
    }

    UltCommandStreamReceiver<GfxFamily> &getUltCommandStreamReceiver() {
-        return reinterpret_cast<UltCommandStreamReceiver<GfxFamily> &>(*BaseClass::gpgpuEngine->commandStreamReceiver);
+        return reinterpret_cast<UltCommandStreamReceiver<GfxFamily> &>(BaseClass::getGpgpuCommandStreamReceiver());
    }

    cl_int enqueueWriteImage(Image *dstImage,
--- a/opencl/test/unit_test/test_files/igdrcl.config
+++ b/opencl/test/unit_test/test_files/igdrcl.config
@ -382,6 +382,7 @@ ForceExtendedKernelIsaSize = -1
 MakeIndirectAllocationsResidentAsPack = -1
 MakeEachAllocationResident = -1
 AssignBCSAtEnqueue = -1
+DeferCmdQGpgpuInitialization = -1
 ReuseKernelBinaries = -1
 EnableChipsetUniqueUUID = -1
 ForceSimdMessageSizeInWalker = -1
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@ -267,6 +267,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ResolveDependenciesViaPipeControls, -1, "-1: def
 DECLARE_DEBUG_VARIABLE(int32_t, MakeIndirectAllocationsResidentAsPack, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver handles all indirect allocations as one pack instead of making them resident individually.")
 DECLARE_DEBUG_VARIABLE(int32_t, MakeEachAllocationResident, -1, "-1: default, 0: disabled, 1: bind every allocation at creation time, 2: bind all created allocations in flush")
 DECLARE_DEBUG_VARIABLE(int32_t, AssignBCSAtEnqueue, -1, "-1: default, 0:disabled, 1: enabled.")
+DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQGpgpuInitialization, -1, "-1: default, 0:disabled, 1: enabled.")
 DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.")

 /*DIRECT SUBMISSION FLAGS*/