diff --git a/runtime/command_queue/command_queue.h b/runtime/command_queue/command_queue.h
index 54159578a9..ac11070f74 100644
--- a/runtime/command_queue/command_queue.h
+++ b/runtime/command_queue/command_queue.h
@@ -402,6 +402,10 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
 
     MOCKABLE_VIRTUAL bool setupDebugSurface(Kernel *kernel);
 
+    bool getRequiresCacheFlushAfterWalker() const { // Read-only accessor for this queue's requiresCacheFlushAfterWalker flag.
+        return requiresCacheFlushAfterWalker;
+    }
+
     // taskCount of last task
     uint32_t taskCount = 0;
 
@@ -451,6 +455,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
 
     bool mapDcFlushRequired = false;
     bool isSpecialCommandQueue = false;
+    bool requiresCacheFlushAfterWalker = false; // Defaults to false; CommandQueueHw overwrites it with requiresCacheFlushAfterWalkerBasedOnProperties(properties).
 
     std::unique_ptr<TimestampPacketContainer> timestampPacketContainer;
 
diff --git a/runtime/command_queue/command_queue_hw.h b/runtime/command_queue/command_queue_hw.h
index 1dcf91362f..202edfd960 100644
--- a/runtime/command_queue/command_queue_hw.h
+++ b/runtime/command_queue/command_queue_hw.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2018 Intel Corporation
+ * Copyright (C) 2017-2019 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,8 +55,12 @@ class CommandQueueHw : public CommandQueue {
             getCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);
             getCommandStreamReceiver().enableNTo1SubmissionModel();
         }
+
+        this->requiresCacheFlushAfterWalker = CommandQueueHw<GfxFamily>::requiresCacheFlushAfterWalkerBasedOnProperties(properties);
     }
 
+    static bool requiresCacheFlushAfterWalkerBasedOnProperties(const cl_queue_properties *properties); // Supplies the value stored in this->requiresCacheFlushAfterWalker.
+
     static CommandQueue *create(Context *context,
                                 Device *device,
                                 const cl_queue_properties *properties) {
diff --git a/runtime/command_queue/command_queue_hw.inl b/runtime/command_queue/command_queue_hw.inl
index ce53b1f5f4..7a02b7e0fe 100644
--- a/runtime/command_queue/command_queue_hw.inl
+++ b/runtime/command_queue/command_queue_hw.inl
@@ -41,4 +41,8 @@ void CommandQueueHw<Family>::notifyEnqueueReadImage(Image *image, bool blockingR
         image->getGraphicsAllocation()->setAllocDumpable(blockingRead);
     }
 }
+template <typename Family>
+bool CommandQueueHw<Family>::requiresCacheFlushAfterWalkerBasedOnProperties(const cl_queue_properties *properties) {
+    return false; // This implementation does not inspect properties: it always reports that no flush after walker is required.
+}
 } // namespace OCLRT
diff --git a/runtime/command_queue/gpgpu_walker.inl b/runtime/command_queue/gpgpu_walker.inl
index 523e84274b..6e4d01dc37 100644
--- a/runtime/command_queue/gpgpu_walker.inl
+++ b/runtime/command_queue/gpgpu_walker.inl
@@ -417,7 +417,7 @@ template <typename GfxFamily>
 size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
     size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS(pKernel) +
                   sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
-    size += KernelCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(pKernel, 0U, 0U);
+    size += KernelCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(commandQueue, pKernel, 0U, 0U);
     size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
     if (reserveProfilingCmdsSpace) {
         size += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
diff --git a/runtime/command_queue/hardware_interface.inl b/runtime/command_queue/hardware_interface.inl
index 6f0c970d68..47bdbb0500 100644
--- a/runtime/command_queue/hardware_interface.inl
+++ b/runtime/command_queue/hardware_interface.inl
@@ -214,7 +214,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
             *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
             pPipeControlCmd->setCommandStreamerStallEnable(true);
         }
-        KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, &kernel, 0U, 0U);
+        KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, commandQueue, &kernel, 0U, 0U);
 
         currentDispatchIndex++;
     }
diff --git a/runtime/gen10/command_queue_gen10.cpp b/runtime/gen10/command_queue_gen10.cpp
index e0fb1237e8..b75c0dc9b0 100644
--- a/runtime/gen10/command_queue_gen10.cpp
+++ b/runtime/gen10/command_queue_gen10.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2018 Intel Corporation
+ * Copyright (C) 2017-2019 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -14,6 +14,8 @@ namespace OCLRT {
 typedef CNLFamily Family;
 static auto gfxCore = IGFX_GEN10_CORE;
 
+template class CommandQueueHw<Family>; // Explicit instantiation of CommandQueueHw for this file's Family (CNLFamily).
+
 template <>
 void populateFactoryTable<CommandQueueHw<Family>>() {
     extern CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE];
diff --git a/runtime/gen8/command_queue_gen8.cpp b/runtime/gen8/command_queue_gen8.cpp
index 0430607ef7..34685b307b 100644
--- a/runtime/gen8/command_queue_gen8.cpp
+++ b/runtime/gen8/command_queue_gen8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2018 Intel Corporation
+ * Copyright (C) 2017-2019 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -14,6 +14,8 @@ namespace OCLRT {
 typedef BDWFamily Family;
 static auto gfxCore = IGFX_GEN8_CORE;
 
+template class CommandQueueHw<Family>; // Explicit instantiation of CommandQueueHw for this file's Family (BDWFamily).
+
 template <>
 void populateFactoryTable<CommandQueueHw<Family>>() {
     extern CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE];
diff --git a/runtime/gen9/command_queue_gen9.cpp b/runtime/gen9/command_queue_gen9.cpp
index 5ff5c2555d..6da47fcf05 100644
--- a/runtime/gen9/command_queue_gen9.cpp
+++ b/runtime/gen9/command_queue_gen9.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2018 Intel Corporation
+ * Copyright (C) 2017-2019 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -14,6 +14,8 @@ namespace OCLRT {
 typedef SKLFamily Family;
 static auto gfxCore = IGFX_GEN9_CORE;
 
+template class CommandQueueHw<Family>; // Explicit instantiation of CommandQueueHw for this file's Family (SKLFamily).
+
 template <>
 void populateFactoryTable<CommandQueueHw<Family>>() {
     extern CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE];
diff --git a/runtime/helpers/kernel_commands.h b/runtime/helpers/kernel_commands.h
index 867857837d..c99ba02eff 100644
--- a/runtime/helpers/kernel_commands.h
+++ b/runtime/helpers/kernel_commands.h
@@ -141,7 +141,7 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
         Kernel &kernel);
 
     static size_t getSizeRequiredCS(const Kernel *kernel);
-    static size_t getSizeRequiredForCacheFlush(const Kernel *kernel, uint64_t postSyncAddress, uint64_t postSyncData);
+    static size_t getSizeRequiredForCacheFlush(const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress, uint64_t postSyncData);
     static bool isPipeControlWArequired();
     static size_t getSizeRequiredDSH(
         const Kernel &kernel);
@@ -201,7 +201,7 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
 
     static void programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData);
     static MI_ATOMIC *programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize);
-    static void programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const Kernel *kernel, uint64_t postSyncAddress, uint64_t postSyncData);
+    static void programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress, uint64_t postSyncData);
 
     static const size_t alignInterfaceDescriptorData = 64 * sizeof(uint8_t);
     static const uint32_t alignIndirectStatePointer = 64 * sizeof(uint8_t);
diff --git a/runtime/helpers/kernel_commands_base.inl b/runtime/helpers/kernel_commands_base.inl
index d97423639d..1cbc938c8d 100644
--- a/runtime/helpers/kernel_commands_base.inl
+++ b/runtime/helpers/kernel_commands_base.inl
@@ -51,8 +51,8 @@ size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredCS(const Kernel *kernel)
 }
 
 template <typename GfxFamily>
-size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(const Kernel *kernel, uint64_t postSyncAddress, uint64_t postSyncData) {
-    return kernel->requiresCacheFlushCommand() ? sizeof(typename GfxFamily::PIPE_CONTROL) : 0;
+size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress, uint64_t postSyncData) {
+    return kernel->requiresCacheFlushCommand(commandQueue) ? sizeof(typename GfxFamily::PIPE_CONTROL) : 0; // Size of one PIPE_CONTROL when the kernel requires a flush on this queue, otherwise 0; postSyncAddress and postSyncData are not used here.
 }
 
 template <typename GfxFamily>
@@ -163,8 +163,8 @@ bool KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(uint32
 }
 
 template <typename GfxFamily>
-void KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const Kernel *kernel, uint64_t postSyncAddress, uint64_t postSyncData) {
-    if (kernel->requiresCacheFlushCommand()) {
+void KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress, uint64_t postSyncData) {
+    if (kernel->requiresCacheFlushCommand(commandQueue)) {
         using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
         auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
         *pipeControl = GfxFamily::cmdInitPipeControl;
diff --git a/runtime/kernel/kernel.cpp b/runtime/kernel/kernel.cpp
index c3692fbb65..7a548e590f 100644
--- a/runtime/kernel/kernel.cpp
+++ b/runtime/kernel/kernel.cpp
@@ -11,6 +11,7 @@
 #include "runtime/built_ins/built_ins.h"
 #include "runtime/built_ins/builtins_dispatch_builder.h"
 #include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/command_queue/command_queue.h"
 #include "runtime/context/context.h"
 #include "runtime/device_queue/device_queue.h"
 #include "runtime/execution_model/device_enqueue.h"
@@ -31,6 +32,7 @@
 #include "runtime/memory_manager/memory_manager.h"
 #include "runtime/memory_manager/surface.h"
 #include "runtime/os_interface/debug_settings_manager.h"
+#include "runtime/platform/platform.h"
 #include "runtime/program/kernel_info.h"
 #include "runtime/program/printf_handler.h"
 #include "runtime/sampler/sampler.h"
@@ -2142,10 +2144,16 @@ void Kernel::fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsF
     }
 }
 
-bool Kernel::requiresCacheFlushCommand() const {
+bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
     if (false == HwHelper::cacheFlushAfterWalkerSupported(device.getHardwareInfo())) {
         return false;
     }
+
+    bool cmdQueueRequiresCacheFlush = commandQueue.getRequiresCacheFlushAfterWalker() || DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get();
+    if (false == cmdQueueRequiresCacheFlush) {
+        return false;
+    }
+
     if (getProgram()->getGlobalSurface() != nullptr) {
         return true;
     }
diff --git a/runtime/kernel/kernel.h b/runtime/kernel/kernel.h
index 31028d896a..4bd63c3471 100644
--- a/runtime/kernel/kernel.h
+++ b/runtime/kernel/kernel.h
@@ -376,7 +376,7 @@ class Kernel : public BaseObject<_cl_kernel> {
 
     void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation);
 
-    bool requiresCacheFlushCommand() const;
+    bool requiresCacheFlushCommand(const CommandQueue &commandQueue) const;
 
     using CacheFlushAllocationsVec = StackVec<GraphicsAllocation *, 32>;
     void getAllocationsForCacheFlush(CacheFlushAllocationsVec &out) const;
diff --git a/runtime/os_interface/debug_variables_base.inl b/runtime/os_interface/debug_variables_base.inl
index b99fb5fff4..00f581b402 100644
--- a/runtime/os_interface/debug_variables_base.inl
+++ b/runtime/os_interface/debug_variables_base.inl
@@ -81,6 +81,7 @@ DECLARE_DEBUG_VARIABLE(bool, DisableZeroCopyForUseHostPtr, false, "When active a
 DECLARE_DEBUG_VARIABLE(bool, DisableZeroCopyForBuffers, false, "When active all buffer allocations will not share memory with CPU.")
 DECLARE_DEBUG_VARIABLE(bool, EnableHostPtrTracking, true, "Enable host ptr tracking")
 DECLARE_DEBUG_VARIABLE(bool, DisableDcFlushInEpilogue, false, "Disable DC flush in epilogue")
+DECLARE_DEBUG_VARIABLE(bool, EnableCacheFlushAfterWalkerForAllQueues, false, "Enable cache flush after walker even if queue doesn't require it")
 
 /*FEATURE FLAGS*/
 DECLARE_DEBUG_VARIABLE(bool, EnableNV12, true, "Enables NV12 extension")
diff --git a/unit_tests/command_queue/command_queue_fixture.cpp b/unit_tests/command_queue/command_queue_fixture.cpp
index 2e682b5d4b..0d892375aa 100644
--- a/unit_tests/command_queue/command_queue_fixture.cpp
+++ b/unit_tests/command_queue/command_queue_fixture.cpp
@@ -1,15 +1,17 @@
 /*
- * Copyright (C) 2017-2018 Intel Corporation
+ * Copyright (C) 2017-2019 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
+#include "hw_cmds.h"
 #include "unit_tests/command_queue/command_queue_fixture.h"
 #include "runtime/command_queue/command_queue_hw.h"
-#include "hw_cmds.h"
 #include "runtime/context/context.h"
 #include "runtime/device/device.h"
+#include "unit_tests/mocks/mock_device.h"
+
 #include "gtest/gtest.h"
 
 namespace OCLRT {
@@ -17,21 +19,32 @@ namespace OCLRT {
 // Global table of create functions
 extern CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE];
 
-CommandQueueHwFixture::CommandQueueHwFixture()
-    : pCmdQ(nullptr), context(nullptr) {
-}
-
 CommandQueue *CommandQueueHwFixture::createCommandQueue(
     Device *pDevice,
     cl_command_queue_properties properties) {
     const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, properties, 0};
 
-    auto funcCreate = commandQueueFactory[pDevice->getRenderCoreFamily()];
-    assert(nullptr != funcCreate);
+    return createCommandQueue(pDevice, props); // Delegates to the pointer overload, passing the zero-terminated {CL_QUEUE_PROPERTIES, properties, 0} list built above.
+}
+
+CommandQueue *CommandQueueHwFixture::createCommandQueue(
+    Device *pDevice,
+    const cl_command_queue_properties *properties) { // NOTE(review): the by-value overload passes a cl_queue_properties array here - presumably both typedefs name the same integer type; confirm.
+
+    if (pDevice == nullptr) { // No device supplied: fall back to a MockDevice held in this->device, created on first use.
+        if (this->device == nullptr) {
+            this->device = MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr);
+        }
+        pDevice = this->device;
+    }
+
     if (!context)
         context = new MockContext(pDevice);
 
-    return funcCreate(context, pDevice, props);
+    auto funcCreate = commandQueueFactory[pDevice->getRenderCoreFamily()]; // Looked up only after pDevice is known to be non-null.
+    assert(nullptr != funcCreate);
+
+    return funcCreate(context, pDevice, properties); // properties is forwarded to the factory function unchanged.
 }
 
 void CommandQueueHwFixture::SetUp() {
@@ -55,7 +68,12 @@ void CommandQueueHwFixture::TearDown() {
         UNRECOVERABLE_IF(blocked);
         pCmdQ->release();
     }
-    context->release();
+    if (context) {
+        context->release();
+    }
+    if (device) {
+        delete device;
+    }
 }
 
 CommandQueueFixture::CommandQueueFixture()
diff --git a/unit_tests/command_queue/command_queue_fixture.h b/unit_tests/command_queue/command_queue_fixture.h
index 7a089a0e76..e24fe1cb37 100644
--- a/unit_tests/command_queue/command_queue_fixture.h
+++ b/unit_tests/command_queue/command_queue_fixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2018 Intel Corporation
+ * Copyright (C) 2017-2019 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -15,19 +15,26 @@ namespace OCLRT {
 class Device;
 
 struct CommandQueueHwFixture {
-    CommandQueueHwFixture();
+    CommandQueue *createCommandQueue(Device *device) { // Convenience overload: creates a queue with a zero properties value.
+        return createCommandQueue(device, cl_command_queue_properties{0}); // Typed zero selects the by-value overload; a bare literal 0 would also match the pointer overload as a null pointer.
+    }
 
     CommandQueue *createCommandQueue(
         Device *device,
         cl_command_queue_properties properties);
 
+    CommandQueue *createCommandQueue(
+        Device *device,
+        const cl_command_queue_properties *properties); // Pointer form: takes a caller-built properties list instead of a single value.
+
     virtual void SetUp();
     virtual void SetUp(Device *_pDevice, cl_command_queue_properties properties);
 
     virtual void TearDown();
 
-    CommandQueue *pCmdQ;
-    MockContext *context;
+    CommandQueue *pCmdQ = nullptr; // In-class initializers replace the removed user-declared constructor.
+    Device *device = nullptr; // Stays null unless the fixture has to create its own device.
+    MockContext *context = nullptr;
 };
 
 struct OOQueueFixture : public CommandQueueHwFixture {
diff --git a/unit_tests/command_queue/command_queue_hw_tests.cpp b/unit_tests/command_queue/command_queue_hw_tests.cpp
index affeda5745..612ed08b30 100644
--- a/unit_tests/command_queue/command_queue_hw_tests.cpp
+++ b/unit_tests/command_queue/command_queue_hw_tests.cpp
@@ -1012,3 +1012,7 @@ HWTEST_F(CommandQueueHwTest, givenKernelSplitEnqueueReadBufferWhenBlockedThenEnq
 
     pCmdQ->isQueueBlocked();
 }
+
+HWTEST_F(CommandQueueHwTest, givenDefaultHwCommandQueueThenCacheFlushAfterWalkerIsNotNeeded) {
+    EXPECT_FALSE(pCmdQ->getRequiresCacheFlushAfterWalker()); // The fixture's queue, created without special properties, must not request a flush after walker.
+}
diff --git a/unit_tests/command_queue/enqueue_kernel_1_tests.cpp b/unit_tests/command_queue/enqueue_kernel_1_tests.cpp
index 6220b98ecf..7b54c95f9c 100644
--- a/unit_tests/command_queue/enqueue_kernel_1_tests.cpp
+++ b/unit_tests/command_queue/enqueue_kernel_1_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018-2019 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,7 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreSetThenClEnqueueNDRangeKernel
     size_t globalWorkSize[3] = {n, 1, 1};
     size_t localWorkSize[3] = {256, 1, 1};
     cl_int retVal = CL_SUCCESS;
-    CommandQueue *pCmdQ2 = createCommandQueue(pDevice, 0);
+    CommandQueue *pCmdQ2 = createCommandQueue(pDevice);
 
     std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
     EXPECT_EQ(CL_SUCCESS, retVal);
@@ -79,7 +79,7 @@ TEST_F(EnqueueKernelTest, givenKernelWhenNotAllArgsAreSetButSetKernelArgIsCalled
     size_t globalWorkSize[3] = {n, 1, 1};
     size_t localWorkSize[3] = {256, 1, 1};
     cl_int retVal = CL_SUCCESS;
-    CommandQueue *pCmdQ2 = createCommandQueue(pDevice, 0);
+    CommandQueue *pCmdQ2 = createCommandQueue(pDevice);
 
     std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
     EXPECT_EQ(CL_SUCCESS, retVal);
@@ -118,7 +118,7 @@ TEST_F(EnqueueKernelTest, givenKernelWhenSetKernelArgIsCalledForEachArgButAtLeas
     size_t globalWorkSize[3] = {n, 1, 1};
     size_t localWorkSize[3] = {256, 1, 1};
     cl_int retVal = CL_SUCCESS;
-    CommandQueue *pCmdQ2 = createCommandQueue(pDevice, 0);
+    CommandQueue *pCmdQ2 = createCommandQueue(pDevice);
 
     std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
     EXPECT_EQ(CL_SUCCESS, retVal);
@@ -380,7 +380,7 @@ HWTEST_F(EnqueueKernelTest, givenReducedAddressSpaceGraphicsAllocationForHostPtr
     MockKernelWithInternals mockKernel(*device, context);
     size_t gws[3] = {1, 0, 0};
     mockCsr->makeResident(*allocation);
-    cmdQ.reset(createCommandQueue(device.get(), 0));
+    cmdQ.reset(createCommandQueue(device.get()));
     auto ret = cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
     EXPECT_EQ(CL_SUCCESS, ret);
     EXPECT_TRUE(mockCsr->passedDispatchFlags.dcFlush);
@@ -402,7 +402,7 @@ HWTEST_F(EnqueueKernelTest, givenReducedAddressSpaceGraphicsAllocationForHostPtr
     MockKernelWithInternals mockKernel(*device, context);
     size_t gws[3] = {1, 0, 0};
     mockCsr->makeResident(*allocation);
-    cmdQ.reset(createCommandQueue(device.get(), 0));
+    cmdQ.reset(createCommandQueue(device.get()));
     auto ret = cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
     EXPECT_EQ(CL_SUCCESS, ret);
     EXPECT_FALSE(mockCsr->passedDispatchFlags.dcFlush);
@@ -425,7 +425,7 @@ HWTEST_F(EnqueueKernelTest, givenFullAddressSpaceGraphicsAllocationWhenEnqueueKe
     MockKernelWithInternals mockKernel(*device, context);
     size_t gws[3] = {1, 0, 0};
     mockCsr->makeResident(*allocation);
-    cmdQ.reset(createCommandQueue(device.get(), 0));
+    cmdQ.reset(createCommandQueue(device.get()));
     auto ret = cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
     EXPECT_EQ(CL_SUCCESS, ret);
     EXPECT_FALSE(mockCsr->passedDispatchFlags.dcFlush);
@@ -433,7 +433,7 @@ HWTEST_F(EnqueueKernelTest, givenFullAddressSpaceGraphicsAllocationWhenEnqueueKe
 
     allocation = (memoryManager->allocateGraphicsMemoryForHostPtr(1, hostPtr, device->isFullRangeSvm(), true));
     mockCsr->makeResident(*allocation);
-    cmdQ.reset(createCommandQueue(device.get(), 0));
+    cmdQ.reset(createCommandQueue(device.get()));
     ret = cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
     EXPECT_EQ(CL_SUCCESS, ret);
     EXPECT_FALSE(mockCsr->passedDispatchFlags.dcFlush);
@@ -944,7 +944,7 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreNotAndEventExistSetThenClEnqu
     size_t globalWorkSize[3] = {n, 1, 1};
     size_t localWorkSize[3] = {256, 1, 1};
     cl_int retVal = CL_SUCCESS;
-    CommandQueue *pCmdQ2 = createCommandQueue(pDevice, 0);
+    CommandQueue *pCmdQ2 = createCommandQueue(pDevice);
 
     std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
     EXPECT_EQ(CL_SUCCESS, retVal);
diff --git a/unit_tests/command_queue/enqueue_kernel_2_tests.cpp b/unit_tests/command_queue/enqueue_kernel_2_tests.cpp
index c173bfcaa7..7b0d2ec25a 100644
--- a/unit_tests/command_queue/enqueue_kernel_2_tests.cpp
+++ b/unit_tests/command_queue/enqueue_kernel_2_tests.cpp
@@ -864,6 +864,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueKernelTest, givenCacheFlushAfterWalkerEnabled
 
     DebugManagerStateRestore dbgRestore;
     DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+    DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(1);
 
     MockKernelWithInternals mockKernel(*pDevice, context);
     CommandQueueHw<FamilyType> cmdQ(context, pDevice, nullptr);
diff --git a/unit_tests/command_queue/enqueue_kernel_two_ioq_tests.cpp b/unit_tests/command_queue/enqueue_kernel_two_ioq_tests.cpp
index 9d90fb004f..98688681ac 100644
--- a/unit_tests/command_queue/enqueue_kernel_two_ioq_tests.cpp
+++ b/unit_tests/command_queue/enqueue_kernel_two_ioq_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2018 Intel Corporation
+ * Copyright (C) 2017-2019 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@ struct TwoIOQsTwoDependentWalkers : public HelloWorldTest<HelloWorldFixtureFacto
         HardwareParse::parseCommands<FamilyType>(*pCmdQ);
 
         // Create a second command queue (beyond the default one)
-        pCmdQ2 = createCommandQueue(pDevice, 0);
+        pCmdQ2 = createCommandQueue(pDevice);
         ASSERT_NE(nullptr, pCmdQ2);
 
         retVal = pCmdQ2->enqueueKernel(
diff --git a/unit_tests/context/driver_diagnostics_tests.h b/unit_tests/context/driver_diagnostics_tests.h
index 8f465e0a08..3c2d1b93a9 100644
--- a/unit_tests/context/driver_diagnostics_tests.h
+++ b/unit_tests/context/driver_diagnostics_tests.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2018 Intel Corporation
+ * Copyright (C) 2017-2019 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -130,7 +130,7 @@ struct PerformanceHintCommandQueueTest : public PerformanceHintTest,
 struct PerformanceHintEnqueueTest : public PerformanceHintTest {
     void SetUp() override {
         PerformanceHintTest::SetUp();
-        pCmdQ = createCommandQueue(pPlatform->getDevice(0), 0);
+        pCmdQ = createCommandQueue(pPlatform->getDevice(0));
     }
 
     void TearDown() override {
diff --git a/unit_tests/helpers/kernel_commands_tests.cpp b/unit_tests/helpers/kernel_commands_tests.cpp
index 6f736e34b1..01ea75c880 100644
--- a/unit_tests/helpers/kernel_commands_tests.cpp
+++ b/unit_tests/helpers/kernel_commands_tests.cpp
@@ -1233,6 +1233,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
 
     DebugManagerStateRestore dbgRestore;
     DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+    DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(1);
 
     CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
     auto &commandStream = cmdQ.getCS(1024);
@@ -1245,10 +1246,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
     EXPECT_NE(allocs.end(), std::find(allocs.begin(), allocs.end(), &globalAllocation));
 
     size_t expectedSize = sizeof(PIPE_CONTROL);
-    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(mockKernelWithInternal->mockKernel, 0U, 0U);
+    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
     EXPECT_EQ(expectedSize, actualSize);
 
-    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, mockKernelWithInternal->mockKernel, 0U, 0U);
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
 
     HardwareParse hwParse;
     hwParse.parseCommands<FamilyType>(commandStream);
@@ -1267,6 +1268,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
 
     DebugManagerStateRestore dbgRestore;
     DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+    DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(1);
 
     CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
     auto &commandStream = cmdQ.getCS(1024);
@@ -1285,10 +1287,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
     EXPECT_EQ(allocs.end(), std::find(allocs.begin(), allocs.end(), &svmAllocation2));
 
     size_t expectedSize = sizeof(PIPE_CONTROL);
-    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(mockKernelWithInternal->mockKernel, 0U, 0U);
+    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
     EXPECT_EQ(expectedSize, actualSize);
 
-    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, mockKernelWithInternal->mockKernel, 0U, 0U);
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
 
     HardwareParse hwParse;
     hwParse.parseCommands<FamilyType>(commandStream);
@@ -1305,6 +1307,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerDisabl
 
     DebugManagerStateRestore dbgRestore;
     DebugManager.flags.EnableCacheFlushAfterWalker.set(0);
+    DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(1);
 
     CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
     auto &commandStream = cmdQ.getCS(1024);
@@ -1312,10 +1315,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerDisabl
     mockKernelWithInternal->mockKernel->svmAllocationsRequireCacheFlush = true;
 
     size_t expectedSize = 0U;
-    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(mockKernelWithInternal->mockKernel, 0U, 0U);
+    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
     EXPECT_EQ(expectedSize, actualSize);
 
-    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, mockKernelWithInternal->mockKernel, 0U, 0U);
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
 
     HardwareParse hwParse;
     hwParse.parseCommands<FamilyType>(commandStream);
@@ -1330,6 +1333,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
 
     DebugManagerStateRestore dbgRestore;
     DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+    DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(1);
 
     CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
     auto &commandStream = cmdQ.getCS(1024);
@@ -1344,10 +1348,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
     EXPECT_NE(allocs.end(), std::find(allocs.begin(), allocs.end(), &cacheRequiringAllocation));
 
     size_t expectedSize = sizeof(PIPE_CONTROL);
-    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(mockKernelWithInternal->mockKernel, 0U, 0U);
+    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
     EXPECT_EQ(expectedSize, actualSize);
 
-    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, mockKernelWithInternal->mockKernel, 0U, 0U);
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
 
     HardwareParse hwParse;
     hwParse.parseCommands<FamilyType>(commandStream);
@@ -1364,6 +1368,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
 
     DebugManagerStateRestore dbgRestore;
     DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+    DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(1);
 
     CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
     auto &commandStream = cmdQ.getCS(1024);
@@ -1371,10 +1376,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
     addSpaceForSingleKernelArg();
 
     size_t expectedSize = 0U;
-    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(mockKernelWithInternal->mockKernel, 0U, 0U);
+    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
     EXPECT_EQ(expectedSize, actualSize);
 
-    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, mockKernelWithInternal->mockKernel, 0U, 0U);
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
 
     HardwareParse hwParse;
     hwParse.parseCommands<FamilyType>(commandStream);
@@ -1389,6 +1394,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
 
     DebugManagerStateRestore dbgRestore;
     DebugManager.flags.EnableCacheFlushAfterWalker.set(-1);
+    DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(1);
     hwInfoHelper.capabilityTable.supportCacheFlushAfterWalker = false;
 
     CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
@@ -1403,10 +1409,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnable
     EXPECT_EQ(0U, allocationsForCacheFlush.size());
 
     size_t expectedSize = 0U;
-    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(mockKernelWithInternal->mockKernel, 0U, 0U);
+    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
     EXPECT_EQ(expectedSize, actualSize);
 
-    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, mockKernelWithInternal->mockKernel, 0U, 0U);
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U, 0U);
 
     HardwareParse hwParse;
     hwParse.parseCommands<FamilyType>(commandStream);
diff --git a/unit_tests/kernel/kernel_tests.cpp b/unit_tests/kernel/kernel_tests.cpp
index 65e2c8487a..715814b8c3 100644
--- a/unit_tests/kernel/kernel_tests.cpp
+++ b/unit_tests/kernel/kernel_tests.cpp
@@ -23,6 +23,7 @@
 #include "unit_tests/fixtures/memory_management_fixture.h"
 #include "unit_tests/helpers/debug_manager_state_restore.h"
 #include "unit_tests/helpers/gtest_helpers.h"
+#include "unit_tests/mocks/mock_command_queue.h"
 #include "unit_tests/mocks/mock_graphics_allocation.h"
 #include "unit_tests/mocks/mock_kernel.h"
 #include "unit_tests/mocks/mock_program.h"
@@ -2384,6 +2385,43 @@ TEST(KernelTest, whenAllocationRequiringCacheFlushThenAssignAllocationPointerToC
     EXPECT_EQ(&mockAllocation, kernel.mockKernel->kernelArgRequiresCacheFlush[0]);
 }
 
+TEST(KernelTest, whenQueueAndKernelRequireCacheFlushAfterWalkerThenRequireCacheFlushAfterWalker) {
+    MockGraphicsAllocation mockAllocation; // NOTE(review): not referenced again in this test - confirm whether it is needed.
+    auto device = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    MockKernelWithInternals kernel(*device);
+    kernel.mockKernel->svmAllocationsRequireCacheFlush = true; // Presumably makes the kernel side require a flush; verify against Kernel::requiresCacheFlushCommand.
+
+    MockCommandQueue queue;
+
+    DebugManagerStateRestore debugRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(true);
+
+    queue.requiresCacheFlushAfterWalker = true; // Queue flag set: a flush is expected.
+    EXPECT_TRUE(kernel.mockKernel->requiresCacheFlushCommand(queue));
+
+    queue.requiresCacheFlushAfterWalker = false; // Queue flag cleared and EnableCacheFlushAfterWalkerForAllQueues not set by this test: no flush is expected.
+    EXPECT_FALSE(kernel.mockKernel->requiresCacheFlushCommand(queue));
+}
+
+TEST(KernelTest, whenCacheFlushEnabledForAllQueuesAndKernelRequireCacheFlushAfterWalkerThenRequireCacheFlushAfterWalker) {
+    MockGraphicsAllocation mockAllocation; // NOTE(review): not referenced again in this test - confirm whether it is needed.
+    auto device = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    MockKernelWithInternals kernel(*device);
+    kernel.mockKernel->svmAllocationsRequireCacheFlush = true; // Presumably makes the kernel side require a flush; verify against Kernel::requiresCacheFlushCommand.
+
+    MockCommandQueue queue;
+
+    DebugManagerStateRestore debugRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(true); // Debug override under test: a flush is expected whatever the queue flag says.
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(true);
+
+    queue.requiresCacheFlushAfterWalker = true;
+    EXPECT_TRUE(kernel.mockKernel->requiresCacheFlushCommand(queue));
+
+    queue.requiresCacheFlushAfterWalker = false; // Even with the queue flag cleared, the override keeps the flush required.
+    EXPECT_TRUE(kernel.mockKernel->requiresCacheFlushCommand(queue));
+}
+
 TEST(KernelTest, whenAllocationWriteableThenAssignAllocationPointerToCacheFlushVector) {
     MockGraphicsAllocation mockAllocation;
     auto device = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
diff --git a/unit_tests/mocks/mock_command_queue.h b/unit_tests/mocks/mock_command_queue.h
index e43a12aabc..ae4248525d 100644
--- a/unit_tests/mocks/mock_command_queue.h
+++ b/unit_tests/mocks/mock_command_queue.h
@@ -18,6 +18,7 @@ class MockCommandQueue : public CommandQueue {
   public:
     using CommandQueue::device;
     using CommandQueue::obtainNewTimestampPacketNodes;
+    using CommandQueue::requiresCacheFlushAfterWalker;
     using CommandQueue::throttle;
     using CommandQueue::timestampPacketContainer;
 
diff --git a/unit_tests/test_files/igdrcl.config b/unit_tests/test_files/igdrcl.config
index 272378f0ca..24cbe735c8 100644
--- a/unit_tests/test_files/igdrcl.config
+++ b/unit_tests/test_files/igdrcl.config
@@ -107,3 +107,4 @@ AUBDumpForceAllToLocalMemory = 0
 EnableCacheFlushAfterWalker = 0
 EnableHostPtrTracking = 1
 DisableDcFlushInEpilogue = 0
+EnableCacheFlushAfterWalkerForAllQueues = 0