Add cache flush command after WALKER command

Change-Id: I3983dc6c0797047e17cc8189655a22a22e85892b
2018-12-06 15:33:02 +01:00 · 2018-12-06 15:33:02 +01:00 · 3dca095ccf
parent 9e81469d9f
commit 3dca095ccf
46 changed files with 1008 additions and 408 deletions
--- a/runtime/api/api.cpp
+++ b/runtime/api/api.cpp
@ -3405,7 +3405,8 @@ void *CL_API_CALL clSVMAlloc(cl_context context,
        return pAlloc;
    }

-    pAlloc = pContext->getSVMAllocsManager()->createSVMAlloc(size, !!(flags & CL_MEM_SVM_FINE_GRAIN_BUFFER));
+    pAlloc = pContext->getSVMAllocsManager()->createSVMAlloc(size, !!(flags & CL_MEM_SVM_FINE_GRAIN_BUFFER),
+                                                             SVMAllocsManager::memFlagIsReadOnly(flags));

    if (pContext->isProvidingPerformanceHints()) {
        pContext->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL, CL_SVM_ALLOC_MEETS_ALIGNMENT_RESTRICTIONS, pAlloc, size);
--- a/runtime/command_queue/gpgpu_walker.inl
+++ b/runtime/command_queue/gpgpu_walker.inl
@ -418,7 +418,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCS(uint32_t cmdType, bool res

 template <typename GfxFamily>
 size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
-    size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
+    size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS(pKernel) +
                  sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
    size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
    if (reserveProfilingCmdsSpace) {
--- a/runtime/command_queue/hardware_interface.inl
+++ b/runtime/command_queue/hardware_interface.inl
@ -191,6 +191,8 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
        // Program the walker.  Invokes execution so all state should already be programmed
        auto walkerCmd = allocateWalkerSpace(*commandStream, kernel);

+        KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, &kernel);
+
        if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
            auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex)->tag;
            GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, walkerCmd, timestampPacket, TimestampPacket::WriteOperationType::AfterWalker);
--- a/runtime/gen10/hw_info_cnl.inl
+++ b/runtime/gen10/hw_info_cnl.inl
@ -38,33 +38,36 @@ const PLATFORM CNL::platform = {
    0,             // usRevId_PCH
    GTTYPE_UNDEFINED};

-const RuntimeCapabilityTable CNL::capabilityTable{0,
-                                                  83.333,
-                                                  21,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  true,  // ftrSupportsVmeAvcTextureSampler
-                                                  true,  // ftrSupportsVmeAvcPreemption
-                                                  false, // ftrRenderCompressedBuffers
-                                                  false, // ftrRenderCompressedImages
-                                                  PreemptionMode::MidThread,
-                                                  {true, true},
-                                                  &isSimulationCNL,
-                                                  true,
-                                                  true,                           // forceStatelessCompilationFor32Bit
-                                                  {false, 0, false, 0, false, 0}, // KmdNotifyProperties
-                                                  true,                           // ftr64KBpages
-                                                  EngineType::ENGINE_RCS,         // defaultEngineType
-                                                  MemoryConstants::pageSize,      // requiredPreemptionSurfaceSize
-                                                  true,
-                                                  true, // sourceLevelDebuggerSupported
-                                                  CmdServicesMemTraceVersion::DeviceValues::Cnl,
-                                                  0,                                 // extraQuantityThreadsPerEU
-                                                  true,                              // SupportsVme
-                                                  64,                                // slmSize
-                                                  MemoryConstants::max48BitAddress}; // gpuAddressSpace
+const RuntimeCapabilityTable CNL::capabilityTable{ // CNL capability values, listed positionally; each entry is labelled with the member it is meant to fill
+    {0, 0, 0, false, false, false},                // kmdNotifyProperties: three numeric values, then three bool flags (all zero/false for CNL)
+    {true, true},                                  // whitelistedRegisters
+    MemoryConstants::max48BitAddress,              // gpuAddressSpace
+    83.333,                                        // defaultProfilingTimerResolution
+    MemoryConstants::pageSize,                     // requiredPreemptionSurfaceSize
+    &isSimulationCNL,                              // isSimulation: address of the CNL-specific isSimulationCNL
+    PreemptionMode::MidThread,                     // defaultPreemptionMode
+    EngineType::ENGINE_RCS,                        // defaultEngineType
+    0,                                             // maxRenderFrequency
+    21,                                            // clVersionSupport (presumably OpenCL 2.1 encoded as major*10+minor - confirm)
+    CmdServicesMemTraceVersion::DeviceValues::Cnl, // aubDeviceId
+    0,                                             // extraQuantityThreadsPerEU
+    64,                                            // slmSize (unit not visible here, presumably KB - confirm)
+    true,                                          // ftrSupportsFP64
+    true,                                          // ftrSupports64BitMath
+    true,                                          // ftrSvm
+    true,                                          // ftrSupportsCoherency
+    true,                                          // ftrSupportsVmeAvcTextureSampler
+    true,                                          // ftrSupportsVmeAvcPreemption
+    false,                                         // ftrRenderCompressedBuffers
+    false,                                         // ftrRenderCompressedImages
+    true,                                          // ftr64KBpages
+    true,                                          // instrumentationEnabled
+    true,                                          // forceStatelessCompilationFor32Bit
+    true,                                          // isCore
+    true,                                          // sourceLevelDebuggerSupported
+    true,                                          // supportsVme
+    false                                          // supportCacheFlushAfterWalker: new trailing entry, set to false for CNL
+};

 const HardwareInfo CNL_2x5x8::hwInfo = {
    &CNL::platform,
--- a/runtime/gen8/gpgpu_walker_gen8.cpp
+++ b/runtime/gen8/gpgpu_walker_gen8.cpp
@ -43,7 +43,7 @@ size_t GpgpuWalkerHelper<BDWFamily>::getSizeForWADisableLSQCROPERFforOCL(const K
    typedef typename BDWFamily::MI_MATH MI_MATH;
    typedef typename BDWFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
    size_t n = 0;
-    if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
+    if (pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        n += sizeof(PIPE_CONTROL) +
             (2 * sizeof(MI_LOAD_REGISTER_REG) +
              sizeof(MI_LOAD_REGISTER_IMM) +
--- a/runtime/gen8/hw_info_bdw.inl
+++ b/runtime/gen8/hw_info_bdw.inl
@ -39,33 +39,36 @@ const PLATFORM BDW::platform = {
    0,             // usRevId_PCH
    GTTYPE_UNDEFINED};

-const RuntimeCapabilityTable BDW::capabilityTable{0,
-                                                  80,
-                                                  21,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  false, // ftrSupportsVmeAvcTextureSampler
-                                                  false, // ftrSupportsVmeAvcPreemption
-                                                  false, // ftrRenderCompressedBuffers
-                                                  false, // ftrRenderCompressedImages
-                                                  PreemptionMode::Disabled,
-                                                  {false, false},
-                                                  &isSimulationBDW,
-                                                  true,
-                                                  true,                                    // forceStatelessCompilationFor32Bit
-                                                  {true, 50000, true, 5000, true, 200000}, // KmdNotifyProperties
-                                                  false,                                   // ftr64KBpages
-                                                  EngineType::ENGINE_RCS,                  // defaultEngineType
-                                                  MemoryConstants::pageSize,               // requiredPreemptionSurfaceSize
-                                                  true,                                    // isCore
-                                                  false,                                   // sourceLevelDebuggerSupported
-                                                  CmdServicesMemTraceVersion::DeviceValues::Bdw,
-                                                  0,                                 // extraQuantityThreadsPerEU
-                                                  true,                              // SupportsVme
-                                                  64,                                // slmSize
-                                                  MemoryConstants::max48BitAddress}; // gpuAddressSpace
+const RuntimeCapabilityTable BDW::capabilityTable{ // BDW capability values, listed positionally; each entry is labelled with the member it is meant to fill
+    {50000, 5000, 200000, true, true, true},       // kmdNotifyProperties: three numeric values, then three bool flags (all flags true for BDW)
+    {false, false},                                // whitelistedRegisters
+    MemoryConstants::max48BitAddress,              // gpuAddressSpace
+    80,                                            // defaultProfilingTimerResolution
+    MemoryConstants::pageSize,                     // requiredPreemptionSurfaceSize
+    &isSimulationBDW,                              // isSimulation: address of the BDW-specific isSimulationBDW
+    PreemptionMode::Disabled,                      // defaultPreemptionMode
+    EngineType::ENGINE_RCS,                        // defaultEngineType
+    0,                                             // maxRenderFrequency
+    21,                                            // clVersionSupport (presumably OpenCL 2.1 encoded as major*10+minor - confirm)
+    CmdServicesMemTraceVersion::DeviceValues::Bdw, // aubDeviceId
+    0,                                             // extraQuantityThreadsPerEU
+    64,                                            // slmSize (unit not visible here, presumably KB - confirm)
+    true,                                          // ftrSupportsFP64
+    true,                                          // ftrSupports64BitMath
+    true,                                          // ftrSvm
+    true,                                          // ftrSupportsCoherency
+    false,                                         // ftrSupportsVmeAvcTextureSampler
+    false,                                         // ftrSupportsVmeAvcPreemption
+    false,                                         // ftrRenderCompressedBuffers
+    false,                                         // ftrRenderCompressedImages
+    false,                                         // ftr64KBpages
+    true,                                          // instrumentationEnabled
+    true,                                          // forceStatelessCompilationFor32Bit
+    true,                                          // isCore
+    false,                                         // sourceLevelDebuggerSupported
+    true,                                          // supportsVme
+    false                                          // supportCacheFlushAfterWalker: new trailing entry, set to false for BDW
+};

 const HardwareInfo BDW_1x2x6::hwInfo = {
    &BDW::platform,
--- a/runtime/gen9/gpgpu_walker_gen9.cpp
+++ b/runtime/gen9/gpgpu_walker_gen9.cpp
@ -43,7 +43,7 @@ size_t GpgpuWalkerHelper<SKLFamily>::getSizeForWADisableLSQCROPERFforOCL(const K
    typedef typename SKLFamily::MI_MATH MI_MATH;
    typedef typename SKLFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
    size_t n = 0;
-    if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
+    if (pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        n += sizeof(PIPE_CONTROL) +
             (2 * sizeof(MI_LOAD_REGISTER_REG) +
              sizeof(MI_LOAD_REGISTER_IMM) +
--- a/runtime/gen9/hw_info_bxt.inl
+++ b/runtime/gen9/hw_info_bxt.inl
@ -36,33 +36,36 @@ const PLATFORM BXT::platform = {
    0,               // usRevId_PCH
    GTTYPE_UNDEFINED};

-const RuntimeCapabilityTable BXT::capabilityTable{0,
-                                                  52.083,
-                                                  12,
-                                                  true,
-                                                  true,
-                                                  false, // ftrSvm
-                                                  true,
-                                                  true,  // ftrSupportsVmeAvcTextureSampler
-                                                  false, // ftrSupportsVmeAvcPreemption
-                                                  false, // ftrRenderCompressedBuffers
-                                                  false, // ftrRenderCompressedImages
-                                                  PreemptionMode::MidThread,
-                                                  {true, false},
-                                                  &isSimulationBXT,
-                                                  true,
-                                                  false,                          // forceStatelessCompilationFor32Bit
-                                                  {false, 0, false, 0, false, 0}, // KmdNotifyProperties
-                                                  false,                          // ftr64KBpages
-                                                  EngineType::ENGINE_RCS,         // defaultEngineType
-                                                  MemoryConstants::pageSize,      // requiredPreemptionSurfaceSize
-                                                  false,                          // isCore
-                                                  true,                           // sourceLevelDebuggerSupported
-                                                  CmdServicesMemTraceVersion::DeviceValues::Bxt,
-                                                  0,                                 // extraQuantityThreadsPerEU
-                                                  true,                              // SupportsVme
-                                                  64,                                // slmSize
-                                                  MemoryConstants::max48BitAddress}; // gpuAddressSpace
+const RuntimeCapabilityTable BXT::capabilityTable{ // BXT capability values, listed positionally; each entry is labelled with the member it is meant to fill
+    {0, 0, 0, false, false, false},                // kmdNotifyProperties: three numeric values, then three bool flags (all zero/false for BXT)
+    {true, false},                                 // whitelistedRegisters
+    MemoryConstants::max48BitAddress,              // gpuAddressSpace
+    52.083,                                        // defaultProfilingTimerResolution
+    MemoryConstants::pageSize,                     // requiredPreemptionSurfaceSize
+    &isSimulationBXT,                              // isSimulation: address of the BXT-specific isSimulationBXT
+    PreemptionMode::MidThread,                     // defaultPreemptionMode
+    EngineType::ENGINE_RCS,                        // defaultEngineType
+    0,                                             // maxRenderFrequency
+    12,                                            // clVersionSupport (presumably OpenCL 1.2 encoded as major*10+minor - confirm)
+    CmdServicesMemTraceVersion::DeviceValues::Bxt, // aubDeviceId
+    0,                                             // extraQuantityThreadsPerEU
+    64,                                            // slmSize (unit not visible here, presumably KB - confirm)
+    true,                                          // ftrSupportsFP64
+    true,                                          // ftrSupports64BitMath
+    false,                                         // ftrSvm
+    true,                                          // ftrSupportsCoherency
+    true,                                          // ftrSupportsVmeAvcTextureSampler
+    false,                                         // ftrSupportsVmeAvcPreemption
+    false,                                         // ftrRenderCompressedBuffers
+    false,                                         // ftrRenderCompressedImages
+    false,                                         // ftr64KBpages
+    true,                                          // instrumentationEnabled
+    false,                                         // forceStatelessCompilationFor32Bit
+    false,                                         // isCore
+    true,                                          // sourceLevelDebuggerSupported
+    true,                                          // supportsVme
+    false                                          // supportCacheFlushAfterWalker: new trailing entry, set to false for BXT
+};

 const HardwareInfo BXT_1x2x6::hwInfo = {
    &BXT::platform,
--- a/runtime/gen9/hw_info_cfl.inl
+++ b/runtime/gen9/hw_info_cfl.inl
@ -31,33 +31,36 @@ const PLATFORM CFL::platform = {
    0,             // usRevId_PCH
    GTTYPE_UNDEFINED};

-const RuntimeCapabilityTable CFL::capabilityTable{0,
-                                                  83.333,
-                                                  21,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  true,  // ftrSupportsVmeAvcTextureSampler
-                                                  false, // ftrSupportsVmeAvcPreemption
-                                                  false, // ftrRenderCompressedBuffers
-                                                  false, // ftrRenderCompressedImages
-                                                  PreemptionMode::MidThread,
-                                                  {true, false},
-                                                  &isSimulationCFL,
-                                                  true,
-                                                  true,                           // forceStatelessCompilationFor32Bit
-                                                  {false, 0, false, 0, false, 0}, // KmdNotifyProperties
-                                                  true,                           // ftr64KBpages
-                                                  EngineType::ENGINE_RCS,         // defaultEngineType
-                                                  MemoryConstants::pageSize,      // requiredPreemptionSurfaceSize
-                                                  true,                           // isCore
-                                                  true,                           // sourceLevelDebuggerSupported
-                                                  CmdServicesMemTraceVersion::DeviceValues::Cfl,
-                                                  0,                                 // extraQuantityThreadsPerEU
-                                                  true,                              // SupportsVme
-                                                  64,                                // slmSize
-                                                  MemoryConstants::max48BitAddress}; // gpuAddressSpace
+const RuntimeCapabilityTable CFL::capabilityTable{ // CFL capability values, listed positionally; each entry is labelled with the member it is meant to fill
+    {0, 0, 0, false, false, false},                // kmdNotifyProperties: three numeric values, then three bool flags (all zero/false for CFL)
+    {true, false},                                 // whitelistedRegisters
+    MemoryConstants::max48BitAddress,              // gpuAddressSpace
+    83.333,                                        // defaultProfilingTimerResolution
+    MemoryConstants::pageSize,                     // requiredPreemptionSurfaceSize
+    &isSimulationCFL,                              // isSimulation: address of the CFL-specific isSimulationCFL
+    PreemptionMode::MidThread,                     // defaultPreemptionMode
+    EngineType::ENGINE_RCS,                        // defaultEngineType
+    0,                                             // maxRenderFrequency
+    21,                                            // clVersionSupport (presumably OpenCL 2.1 encoded as major*10+minor - confirm)
+    CmdServicesMemTraceVersion::DeviceValues::Cfl, // aubDeviceId
+    0,                                             // extraQuantityThreadsPerEU
+    64,                                            // slmSize (unit not visible here, presumably KB - confirm)
+    true,                                          // ftrSupportsFP64
+    true,                                          // ftrSupports64BitMath
+    true,                                          // ftrSvm
+    true,                                          // ftrSupportsCoherency
+    true,                                          // ftrSupportsVmeAvcTextureSampler
+    false,                                         // ftrSupportsVmeAvcPreemption
+    false,                                         // ftrRenderCompressedBuffers
+    false,                                         // ftrRenderCompressedImages
+    true,                                          // ftr64KBpages
+    true,                                          // instrumentationEnabled
+    true,                                          // forceStatelessCompilationFor32Bit
+    true,                                          // isCore
+    true,                                          // sourceLevelDebuggerSupported
+    true,                                          // supportsVme
+    false                                          // supportCacheFlushAfterWalker: new trailing entry, set to false for CFL
+};

 const HardwareInfo CFL_1x2x6::hwInfo = {
    &CFL::platform,
--- a/runtime/gen9/hw_info_glk.inl
+++ b/runtime/gen9/hw_info_glk.inl
@ -31,33 +31,36 @@ const PLATFORM GLK::platform = {
    0,               // usRevId_PCH
    GTTYPE_UNDEFINED};

-const RuntimeCapabilityTable GLK::capabilityTable{0,
-                                                  52.083,
-                                                  12,
-                                                  true,
-                                                  true,
-                                                  false, // ftrSvm
-                                                  true,
-                                                  true,  // ftrSupportsVmeAvcTextureSampler
-                                                  false, // ftrSupportsVmeAvcPreemption
-                                                  false, // ftrRenderCompressedBuffers
-                                                  false, // ftrRenderCompressedImages
-                                                  PreemptionMode::MidThread,
-                                                  {true, false},
-                                                  &isSimulationGLK,
-                                                  true,
-                                                  false,                             // forceStatelessCompilationFor32Bit
-                                                  {true, 30000, false, 0, false, 0}, // KmdNotifyProperties
-                                                  false,                             // ftr64KBpages
-                                                  EngineType::ENGINE_RCS,            // defaultEngineType
-                                                  MemoryConstants::pageSize,         // requiredPreemptionSurfaceSize
-                                                  false,                             // isCore
-                                                  true,                              // sourceLevelDebuggerSupported
-                                                  CmdServicesMemTraceVersion::DeviceValues::Glk,
-                                                  0,                                 // extraQuantityThreadsPerEU
-                                                  true,                              // SupportsVme
-                                                  64,                                // slmSize
-                                                  MemoryConstants::max48BitAddress}; // gpuAddressSpace
+const RuntimeCapabilityTable GLK::capabilityTable{ // GLK capability values, listed positionally; each entry is labelled with the member it is meant to fill
+    {30000, 0, 0, true, false, false},             // kmdNotifyProperties: three numeric values, then three bool flags (only the first value/flag pair is set for GLK)
+    {true, false},                                 // whitelistedRegisters
+    MemoryConstants::max48BitAddress,              // gpuAddressSpace
+    52.083,                                        // defaultProfilingTimerResolution
+    MemoryConstants::pageSize,                     // requiredPreemptionSurfaceSize
+    &isSimulationGLK,                              // isSimulation: address of the GLK-specific isSimulationGLK
+    PreemptionMode::MidThread,                     // defaultPreemptionMode
+    EngineType::ENGINE_RCS,                        // defaultEngineType
+    0,                                             // maxRenderFrequency
+    12,                                            // clVersionSupport (presumably OpenCL 1.2 encoded as major*10+minor - confirm)
+    CmdServicesMemTraceVersion::DeviceValues::Glk, // aubDeviceId
+    0,                                             // extraQuantityThreadsPerEU
+    64,                                            // slmSize (unit not visible here, presumably KB - confirm)
+    true,                                          // ftrSupportsFP64
+    true,                                          // ftrSupports64BitMath
+    false,                                         // ftrSvm
+    true,                                          // ftrSupportsCoherency
+    true,                                          // ftrSupportsVmeAvcTextureSampler
+    false,                                         // ftrSupportsVmeAvcPreemption
+    false,                                         // ftrRenderCompressedBuffers
+    false,                                         // ftrRenderCompressedImages
+    false,                                         // ftr64KBpages
+    true,                                          // instrumentationEnabled
+    false,                                         // forceStatelessCompilationFor32Bit
+    false,                                         // isCore
+    true,                                          // sourceLevelDebuggerSupported
+    true,                                          // supportsVme
+    false                                          // supportCacheFlushAfterWalker: new trailing entry, set to false for GLK
+};

 const HardwareInfo GLK_1x3x6::hwInfo = {
    &GLK::platform,
--- a/runtime/gen9/hw_info_kbl.inl
+++ b/runtime/gen9/hw_info_kbl.inl
@ -31,33 +31,36 @@ const PLATFORM KBL::platform = {
    0,             // usRevId_PCH
    GTTYPE_UNDEFINED};

-const RuntimeCapabilityTable KBL::capabilityTable{0,
-                                                  83.333,
-                                                  21,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  true,  // ftrSupportsVmeAvcTextureSampler
-                                                  false, // ftrSupportsVmeAvcPreemption
-                                                  false, // ftrRenderCompressedBuffers
-                                                  false, // ftrRenderCompressedImages
-                                                  PreemptionMode::MidThread,
-                                                  {true, false},
-                                                  &isSimulationKBL,
-                                                  true,
-                                                  true,                           // forceStatelessCompilationFor32Bit
-                                                  {false, 0, false, 0, false, 0}, // KmdNotifyProperties
-                                                  true,                           // ftr64KBpages
-                                                  EngineType::ENGINE_RCS,         // defaultEngineType
-                                                  MemoryConstants::pageSize,      // requiredPreemptionSurfaceSize
-                                                  true,                           // isCore
-                                                  true,                           // sourceLevelDebuggerSupported
-                                                  CmdServicesMemTraceVersion::DeviceValues::Kbl,
-                                                  0,                                 // extraQuantityThreadsPerEU
-                                                  true,                              // SupportsVme
-                                                  64,                                // slmSize
-                                                  MemoryConstants::max48BitAddress}; // gpuAddressSpace
+const RuntimeCapabilityTable KBL::capabilityTable{ // KBL capability values, listed positionally; each entry is labelled with the member it is meant to fill
+    {0, 0, 0, false, false, false},                // kmdNotifyProperties: three numeric values, then three bool flags (all zero/false for KBL)
+    {true, false},                                 // whitelistedRegisters
+    MemoryConstants::max48BitAddress,              // gpuAddressSpace
+    83.333,                                        // defaultProfilingTimerResolution
+    MemoryConstants::pageSize,                     // requiredPreemptionSurfaceSize
+    &isSimulationKBL,                              // isSimulation: address of the KBL-specific isSimulationKBL
+    PreemptionMode::MidThread,                     // defaultPreemptionMode
+    EngineType::ENGINE_RCS,                        // defaultEngineType
+    0,                                             // maxRenderFrequency
+    21,                                            // clVersionSupport (presumably OpenCL 2.1 encoded as major*10+minor - confirm)
+    CmdServicesMemTraceVersion::DeviceValues::Kbl, // aubDeviceId
+    0,                                             // extraQuantityThreadsPerEU
+    64,                                            // slmSize (unit not visible here, presumably KB - confirm)
+    true,                                          // ftrSupportsFP64
+    true,                                          // ftrSupports64BitMath
+    true,                                          // ftrSvm
+    true,                                          // ftrSupportsCoherency
+    true,                                          // ftrSupportsVmeAvcTextureSampler
+    false,                                         // ftrSupportsVmeAvcPreemption
+    false,                                         // ftrRenderCompressedBuffers
+    false,                                         // ftrRenderCompressedImages
+    true,                                          // ftr64KBpages
+    true,                                          // instrumentationEnabled
+    true,                                          // forceStatelessCompilationFor32Bit
+    true,                                          // isCore
+    true,                                          // sourceLevelDebuggerSupported
+    true,                                          // supportsVme
+    false                                          // supportCacheFlushAfterWalker: new trailing entry, set to false for KBL
+};

 const HardwareInfo KBL_1x2x6::hwInfo = {
    &KBL::platform,
--- a/runtime/gen9/hw_info_skl.inl
+++ b/runtime/gen9/hw_info_skl.inl
@ -39,33 +39,36 @@ const PLATFORM SKL::platform = {
    0,             // usRevId_PCH
    GTTYPE_UNDEFINED};

-const RuntimeCapabilityTable SKL::capabilityTable{0,
-                                                  83.333,
-                                                  21,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  true,
-                                                  true,  // ftrSupportsVmeAvcTextureSampler
-                                                  false, // ftrSupportsVmeAvcPreemption
-                                                  false, // ftrRenderCompressedBuffers
-                                                  false, // ftrRenderCompressedImages
-                                                  PreemptionMode::MidThread,
-                                                  {true, false},
-                                                  &isSimulationSKL,
-                                                  true,
-                                                  true,                           // forceStatelessCompilationFor32Bit
-                                                  {false, 0, false, 0, false, 0}, // KmdNotifyProperties
-                                                  true,                           // ftr64KBpages
-                                                  EngineType::ENGINE_RCS,         // defaultEngineType
-                                                  MemoryConstants::pageSize,      // requiredPreemptionSurfaceSize
-                                                  true,                           // isCore
-                                                  true,                           // sourceLevelDebuggerSupported
-                                                  CmdServicesMemTraceVersion::DeviceValues::Skl,
-                                                  0,                                 // extraQuantityThreadsPerEU
-                                                  true,                              // SupportsVme
-                                                  64,                                // slmSize
-                                                  MemoryConstants::max48BitAddress}; // gpuAddressSpace
+const RuntimeCapabilityTable SKL::capabilityTable{ // positional initializer: entries follow RuntimeCapabilityTable member order
+    {0, 0, 0, false, false, false},                // kmdNotifyProperties (three delays, then three enable switches)
+    {true, false},                                 // whitelistedRegisters
+    MemoryConstants::max48BitAddress,              // gpuAddressSpace
+    83.333,                                        // defaultProfilingTimerResolution
+    MemoryConstants::pageSize,                     // requiredPreemptionSurfaceSize
+    &isSimulationSKL,                              // isSimulation
+    PreemptionMode::MidThread,                     // defaultPreemptionMode
+    EngineType::ENGINE_RCS,                        // defaultEngineType
+    0,                                             // maxRenderFrequency
+    21,                                            // clVersionSupport
+    CmdServicesMemTraceVersion::DeviceValues::Skl, // aubDeviceId
+    0,                                             // extraQuantityThreadsPerEU
+    64,                                            // slmSize
+    true,                                          // ftrSupportsFP64
+    true,                                          // ftrSupports64BitMath
+    true,                                          // ftrSvm
+    true,                                          // ftrSupportsCoherency
+    true,                                          // ftrSupportsVmeAvcTextureSampler
+    false,                                         // ftrSupportsVmeAvcPreemption
+    false,                                         // ftrRenderCompressedBuffers
+    false,                                         // ftrRenderCompressedImages
+    true,                                          // ftr64KBpages
+    true,                                          // instrumentationEnabled
+    true,                                          // forceStatelessCompilationFor32Bit
+    true,                                          // isCore
+    true,                                          // sourceLevelDebuggerSupported
+    true,                                          // supportsVme
+    false                                          // supportCacheFlushAfterWalker (EnableCacheFlushAfterWalker debug flag set to 0 or 1 overrides this)
+};

 const HardwareInfo SKL_1x2x6::hwInfo = {
    &SKL::platform,
--- a/runtime/helpers/hw_info.h
+++ b/runtime/helpers/hw_info.h
@ -32,10 +32,19 @@ struct WhitelistedRegisters {
 };

 struct RuntimeCapabilityTable {
-    uint32_t maxRenderFrequency;
+    KmdNotifyProperties kmdNotifyProperties;
+    WhitelistedRegisters whitelistedRegisters;
+    uint64_t gpuAddressSpace;
    double defaultProfilingTimerResolution;
-
+    size_t requiredPreemptionSurfaceSize;
+    bool (*isSimulation)(unsigned short);
+    PreemptionMode defaultPreemptionMode;
+    EngineType defaultEngineType;
+    uint32_t maxRenderFrequency;
    unsigned int clVersionSupport;
+    uint32_t aubDeviceId;
+    uint32_t extraQuantityThreadsPerEU;
+    uint32_t slmSize;
    bool ftrSupportsFP64;
    bool ftrSupports64BitMath;
    bool ftrSvm;
@ -44,29 +53,13 @@ struct RuntimeCapabilityTable {
    bool ftrSupportsVmeAvcPreemption;
    bool ftrRenderCompressedBuffers;
    bool ftrRenderCompressedImages;
-    PreemptionMode defaultPreemptionMode;
-    WhitelistedRegisters whitelistedRegisters;
-
-    bool (*isSimulation)(unsigned short);
-    bool instrumentationEnabled;
-
-    bool forceStatelessCompilationFor32Bit;
-
-    KmdNotifyProperties kmdNotifyProperties;
-
    bool ftr64KBpages;
-
-    EngineType defaultEngineType;
-
-    size_t requiredPreemptionSurfaceSize;
+    bool instrumentationEnabled;
+    bool forceStatelessCompilationFor32Bit;
    bool isCore;
    bool sourceLevelDebuggerSupported;
-    uint32_t aubDeviceId;
-
-    uint32_t extraQuantityThreadsPerEU;
    bool supportsVme;
-    uint32_t slmSize;
-    uint64_t gpuAddressSpace;
+    bool supportCacheFlushAfterWalker;
 };

 struct HardwareCapabilities {
--- a/runtime/helpers/kernel_commands.h
+++ b/runtime/helpers/kernel_commands.h
@ -142,7 +142,7 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
        const bool &kernelUsesLocalIds,
        Kernel &kernel);

-    static size_t getSizeRequiredCS();
+    static size_t getSizeRequiredCS(const Kernel *kernel);
    static bool isPipeControlWArequired();
    static size_t getSizeRequiredDSH(
        const Kernel &kernel);
@ -202,6 +202,7 @@ struct KernelCommandsHelper : public PerThreadDataHelper {

    static void programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData);
    static MI_ATOMIC *programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize);
+    static void programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const Kernel *kernel);

    static const size_t alignInterfaceDescriptorData = 64 * sizeof(uint8_t);
    static const uint32_t alignIndirectStatePointer = 64 * sizeof(uint8_t);
--- a/runtime/helpers/kernel_commands_base.inl
+++ b/runtime/helpers/kernel_commands_base.inl
@ -7,6 +7,7 @@

 #pragma once
 #include "runtime/helpers/kernel_commands.h"
+#include "runtime/kernel/kernel.h"

 namespace OCLRT {

@ -43,9 +44,13 @@ uint32_t KernelCommandsHelper<GfxFamily>::additionalSizeRequiredDsh() {
 }

 template <typename GfxFamily>
-size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() {
-    return 2 * sizeof(typename GfxFamily::MEDIA_STATE_FLUSH) +
-           sizeof(typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD);
+size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredCS(const Kernel *kernel) {
+    // Always reserved: two MEDIA_STATE_FLUSH commands and one MEDIA_INTERFACE_DESCRIPTOR_LOAD.
+    const size_t mediaCommandsSize = 2 * sizeof(typename GfxFamily::MEDIA_STATE_FLUSH) +
+                                     sizeof(typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD);
+    // Reserved only when the kernel reports that it requires a cache flush command: one PIPE_CONTROL.
+    const size_t cacheFlushSize = kernel->requiresCacheFlushCommand() ? sizeof(typename GfxFamily::PIPE_CONTROL) : 0u;
+    return mediaCommandsSize + cacheFlushSize;
 }

 template <typename GfxFamily>
@ -155,4 +160,14 @@ bool KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(uint32
    return true;
 }

+// Writes one PIPE_CONTROL (command streamer stall + DC flush enabled) into commandStream,
+// but only when the kernel reports that it requires a cache flush command; otherwise emits nothing.
+template <typename GfxFamily>
+void KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const Kernel *kernel) {
+    if (!kernel->requiresCacheFlushCommand()) {
+        return;
+    }
+    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
+    auto flushCmd = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
+    // Start from the family's default-initialized PIPE_CONTROL, then enable the two flush-related bits.
+    *flushCmd = GfxFamily::cmdInitPipeControl;
+    flushCmd->setCommandStreamerStallEnable(true);
+    flushCmd->setDcFlushEnable(true);
+}
 } // namespace OCLRT
--- a/runtime/helpers/kmd_notify_properties.h
+++ b/runtime/helpers/kmd_notify_properties.h
@ -14,15 +14,15 @@

 namespace OCLRT {
 struct KmdNotifyProperties {
+    int64_t delayKmdNotifyMicroseconds; // delays come first, enable switches after: positional initializers rely on this order
+    int64_t delayQuickKmdSleepMicroseconds;
+    int64_t delayQuickKmdSleepForSporadicWaitsMicroseconds;
    // Main switch for KMD Notify optimization - if its disabled, all below are disabled too
    bool enableKmdNotify;
-    int64_t delayKmdNotifyMicroseconds;
    // Use smaller delay in specific situations (ie. from AsyncEventsHandler)
    bool enableQuickKmdSleep;
-    int64_t delayQuickKmdSleepMicroseconds;
    // If waits are called sporadically  use QuickKmdSleep mode, otherwise use standard delay
    bool enableQuickKmdSleepForSporadicWaits;
-    int64_t delayQuickKmdSleepForSporadicWaitsMicroseconds;
 };

 namespace KmdNotifyConstants {
--- a/runtime/kernel/kernel.cpp
+++ b/runtime/kernel/kernel.cpp
@ -309,6 +309,7 @@ cl_int Kernel::initialize() {
        kernelArguments.resize(numArgs);
        slmSizes.resize(numArgs);
        kernelArgHandlers.resize(numArgs);
+        kernelArgRequiresCacheFlush.resize(numArgs);

        for (uint32_t i = 0; i < numArgs; ++i) {
            storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0);
@ -849,6 +850,8 @@ cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, G
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
+    addAllocationToCacheFlushVector(argIndex, svmAlloc);
+
    return CL_SUCCESS;
 }

@ -884,6 +887,9 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
+
+    addAllocationToCacheFlushVector(argIndex, svmAlloc);
+
    return CL_SUCCESS;
 }

@ -908,10 +914,14 @@ const Kernel::SimpleKernelArgInfo &Kernel::getKernelArgInfo(uint32_t argIndex) c

 void Kernel::setKernelExecInfo(GraphicsAllocation *argValue) {
    kernelSvmGfxAllocations.push_back(argValue);
+    if (allocationForCacheFlush(argValue)) { // a single flush-requiring SVM allocation is enough to set the flag
+        svmAllocationsRequireCacheFlush = true; // stays set until clearKernelExecInfo() resets it
+    }
 }

 void Kernel::clearKernelExecInfo() {
    kernelSvmGfxAllocations.clear();
+    svmAllocationsRequireCacheFlush = false; // the exec-info allocation list is empty again, so drop the flush requirement
 }

 inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceiver) {
@ -1119,7 +1129,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
            auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
            buffer->setArgStateful(surfaceState, forceNonAuxMode);
        }
-
+        addAllocationToCacheFlushVector(argIndex, buffer->getGraphicsAllocation());
        return CL_SUCCESS;
    } else {

@ -1243,7 +1253,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
        patch<uint32_t, cl_channel_order>(imageFormat.image_channel_order, crossThreadData, kernelArgInfo.offsetChannelOrder);
        patch<uint32_t, uint32_t>(kernelArgInfo.offsetHeap, crossThreadData, kernelArgInfo.offsetObjectId);
        patch<uint32_t, cl_uint>(imageDesc.num_mip_levels, crossThreadData, kernelArgInfo.offsetNumMipLevels);
-
+        addAllocationToCacheFlushVector(argIndex, pImage->getGraphicsAllocation());
        retVal = CL_SUCCESS;
    }

@ -2122,4 +2132,51 @@ void Kernel::fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsF
        }
    }
 }
+
+bool Kernel::platformSupportCacheFlushAfterWalker() const {
+    // The EnableCacheFlushAfterWalker debug flag wins when it is exactly 1 (force on) or 0 (force off);
+    // any other value defers to the device's capability table.
+    switch (DebugManager.flags.EnableCacheFlushAfterWalker.get()) {
+    case 1:
+        return true;
+    case 0:
+        return false;
+    default:
+        return device.getHardwareInfo().capabilityTable.supportCacheFlushAfterWalker;
+    }
+}
+
+bool Kernel::requiresCacheFlushCommand() const {
+    // Nothing to do when the feature is off for this platform / debug configuration.
+    if (!platformSupportCacheFlushAfterWalker()) {
+        return false;
+    }
+    // A program global surface or a flush-requiring exec-info SVM allocation is sufficient on its own.
+    if (getProgram()->getGlobalSurface() != nullptr || svmAllocationsRequireCacheFlush) {
+        return true;
+    }
+    // Otherwise a flush is needed as soon as any kernel argument slot holds a tracked allocation.
+    for (const auto *trackedAllocation : kernelArgRequiresCacheFlush) {
+        if (trackedAllocation != nullptr) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool Kernel::allocationForCacheFlush(GraphicsAllocation *argAllocation) {
+    // True when the allocation has flushL3Required set or reports writable mem-object flags.
+    // The pointer is dereferenced unconditionally, so callers must pass a non-null allocation.
+    return argAllocation->flushL3Required || argAllocation->isMemObjectsAllocationWithWritableFlags();
+}
+
+void Kernel::addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation) {
+    // The slot for argIndex keeps the allocation only when it is non-null and needs a cache flush;
+    // in every other case the slot is reset to nullptr so a previously stored value does not linger.
+    const bool needsFlush = (argAllocation != nullptr) && allocationForCacheFlush(argAllocation);
+    kernelArgRequiresCacheFlush[argIndex] = needsFlush ? argAllocation : nullptr;
+}
 } // namespace OCLRT
--- a/runtime/kernel/kernel.h
+++ b/runtime/kernel/kernel.h
@ -374,6 +374,8 @@ class Kernel : public BaseObject<_cl_kernel> {

    void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &buffersForAuxTranslation);

+    bool requiresCacheFlushCommand() const;
+
  protected:
    struct ObjectCounts {
        uint32_t imageCount;
@ -461,6 +463,9 @@ class Kernel : public BaseObject<_cl_kernel> {

    void reconfigureKernel();

+    bool platformSupportCacheFlushAfterWalker() const;
+    void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation);
+    bool allocationForCacheFlush(GraphicsAllocation *argAllocation);
    Program *program;
    Context *context;
    const Device &device;
@ -493,5 +498,7 @@ class Kernel : public BaseObject<_cl_kernel> {
    std::unique_ptr<ImageTransformer> imageTransformer;

    bool specialPipelineSelectMode = false;
+    bool svmAllocationsRequireCacheFlush = false;
+    std::vector<GraphicsAllocation *> kernelArgRequiresCacheFlush;
 };
 } // namespace OCLRT
--- a/runtime/memory_manager/svm_memory_manager.cpp
+++ b/runtime/memory_manager/svm_memory_manager.cpp
@ -1,23 +1,8 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (C) 2017-2018 Intel Corporation
 *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
+ * SPDX-License-Identifier: MIT
 *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
 */

 #include "runtime/memory_manager/memory_manager.h"
@ -64,7 +49,7 @@ GraphicsAllocation *SVMAllocsManager::MapBasedAllocationTracker::get(const void
 SVMAllocsManager::SVMAllocsManager(MemoryManager *memoryManager) : memoryManager(memoryManager) {
 }

-void *SVMAllocsManager::createSVMAlloc(size_t size, bool coherent) {
+void *SVMAllocsManager::createSVMAlloc(size_t size, bool coherent, bool readOnly) {
    if (size == 0)
        return nullptr;

@ -73,6 +58,7 @@ void *SVMAllocsManager::createSVMAlloc(size_t size, bool coherent) {
    if (!GA) {
        return nullptr;
    }
+    GA->setMemObjectsAllocationWithWritableFlags(!readOnly);
    this->SVMAllocs.insert(*GA);

    return GA->getUnderlyingBuffer();
@ -91,4 +77,8 @@ void SVMAllocsManager::freeSVMAlloc(void *ptr) {
        memoryManager->freeGraphicsMemory(GA);
    }
 }
+
+bool SVMAllocsManager::memFlagIsReadOnly(cl_svm_mem_flags flags) {
+    // Returns true when at least one of the flags in the mask below is set.
+    // NOTE(review): the two HOST_* flags describe host access only - confirm that treating them as read-only is intended.
+    const cl_svm_mem_flags readOnlyMask = CL_MEM_READ_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS;
+    return (flags & readOnlyMask) != 0;
+}
 } // namespace OCLRT
--- a/runtime/memory_manager/svm_memory_manager.h
+++ b/runtime/memory_manager/svm_memory_manager.h
@ -1,29 +1,15 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (C) 2017-2018 Intel Corporation
 *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
+ * SPDX-License-Identifier: MIT
 *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
 */

 #pragma once
 #include <cstdint>
 #include <map>
 #include <mutex>
+#include "CL/cl.h"

 namespace OCLRT {
 class Device;
@ -45,10 +31,11 @@ class SVMAllocsManager {
    };

    SVMAllocsManager(MemoryManager *memoryManager);
-    void *createSVMAlloc(size_t size, bool coherent = false);
+    void *createSVMAlloc(size_t size, bool coherent, bool readOnly);
    GraphicsAllocation *getSVMAlloc(const void *ptr);
    void freeSVMAlloc(void *ptr);
    size_t getNumAllocs() const { return SVMAllocs.getNumAllocs(); }
+    static bool memFlagIsReadOnly(cl_svm_mem_flags flags);

  protected:
    MapBasedAllocationTracker SVMAllocs;
--- a/runtime/os_interface/debug_variables_base.inl
+++ b/runtime/os_interface/debug_variables_base.inl
@ -91,6 +91,7 @@ DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, false, "Enables algor
 DECLARE_DEBUG_VARIABLE(bool, EnableVaLibCalls, true, "Enable cl-va sharing lib calls")
 DECLARE_DEBUG_VARIABLE(bool, AddClGlSharing, false, "Add cl-gl extension")
 DECLARE_DEBUG_VARIABLE(bool, EnablePassInlineData, false, "Enable passing of inline data")
+DECLARE_DEBUG_VARIABLE(int32_t, EnableCacheFlushAfterWalker, 0, "-1: platform behavior, 0: disabled, 1: enabled. Adds dedicated cache flush command after WALKER command when surfaces used by kernel require to flush the cache")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableLocalMemory, -1, "-1: default behavior, 0: disabled, 1: enabled, Allows allocating graphics memory in Local Memory")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableStatelessToStatefulBufferOffsetOpt, -1, "-1: dont override, 0: disable, 1: enable, Enables buffer-offset improvement of the stateless to stateful optimization")
 DECLARE_DEBUG_VARIABLE(int32_t, CreateMultipleDevices, 0, "0: default - disable, 1+: Driver will create multiple (N) devices during initialization.")
--- a/unit_tests/api/cl_set_kernel_exec_info_tests.inl
+++ b/unit_tests/api/cl_set_kernel_exec_info_tests.inl
@ -173,6 +173,34 @@ TEST_F(clSetKernelExecInfoTests, success_SvmPtrListWithMultiplePointers) {
        EXPECT_EQ(CL_SUCCESS, retVal);

        EXPECT_EQ(3u, pMockKernel->getKernelSvmGfxAllocations().size());
+        EXPECT_TRUE(pMockKernel->svmAllocationsRequireCacheFlush);
+
+        clSVMFree(pContext, ptrSvm1);
+        clSVMFree(pContext, ptrSvm2);
+    }
+}
+
+TEST_F(clSetKernelExecInfoTests, givenReadOnlySvmPtrListWhenUsedAsKernelPointersThenNoCacheFlushRequire) {
+    if (svmCapabilities != 0) {
+        void *ptrSvm1 = clSVMAlloc(pContext, CL_MEM_READ_ONLY, 256, 4);
+        EXPECT_NE(nullptr, ptrSvm1);
+
+        void *ptrSvm2 = clSVMAlloc(pContext, CL_MEM_READ_ONLY, 256, 4);
+        EXPECT_NE(nullptr, ptrSvm2);
+
+        void *pSvmPtrList[] = {ptrSvm1, ptrSvm2};
+        size_t SvmPtrListSizeInBytes = 2 * sizeof(void *);
+
+        retVal = clSetKernelExecInfo(
+            pMockKernel,                  // cl_kernel kernel
+            CL_KERNEL_EXEC_INFO_SVM_PTRS, // cl_kernel_exec_info param_name
+            SvmPtrListSizeInBytes,        // size_t param_value_size
+            pSvmPtrList                   // const void *param_value
+        );
+        EXPECT_EQ(CL_SUCCESS, retVal);
+
+        EXPECT_EQ(2u, pMockKernel->getKernelSvmGfxAllocations().size());
+        EXPECT_FALSE(pMockKernel->svmAllocationsRequireCacheFlush);

        clSVMFree(pContext, ptrSvm1);
        clSVMFree(pContext, ptrSvm2);
--- a/unit_tests/command_queue/dispatch_walker_tests.cpp
+++ b/unit_tests/command_queue/dispatch_walker_tests.cpp
@ -78,11 +78,11 @@ struct DispatchWalkerTest : public CommandQueueFixture, public DeviceFixture, pu

    std::unique_ptr<MockProgram> program;

-    SKernelBinaryHeaderCommon kernelHeader;
-    SPatchDataParameterStream dataParameterStream;
-    SPatchExecutionEnvironment executionEnvironment;
-    SPatchThreadPayload threadPayload;
-    SPatchSamplerStateArray samplerArray;
+    SKernelBinaryHeaderCommon kernelHeader = {};
+    SPatchDataParameterStream dataParameterStream = {};
+    SPatchExecutionEnvironment executionEnvironment = {};
+    SPatchThreadPayload threadPayload = {};
+    SPatchSamplerStateArray samplerArray = {};

    KernelInfo kernelInfo;
    KernelInfo kernelInfoWithSampler;
@ -111,7 +111,7 @@ HWTEST_F(DispatchWalkerTest, shouldntChangeCommandStreamMemory) {

    // Consume all memory except what is needed for this enqueue
    auto sizeDispatchWalkerNeeds = sizeof(typename FamilyType::WALKER_TYPE) +
-                                   KernelCommandsHelper<FamilyType>::getSizeRequiredCS();
+                                   KernelCommandsHelper<FamilyType>::getSizeRequiredCS(&kernel);

    //cs has a minimum required size
    auto sizeThatNeedsToBeSubstracted = sizeDispatchWalkerNeeds + CSRequirements::minCommandQueueCommandStreamSize;
@ -160,7 +160,7 @@ HWTEST_F(DispatchWalkerTest, noLocalIdsShouldntCrash) {

    // Consume all memory except what is needed for this enqueue
    auto sizeDispatchWalkerNeeds = sizeof(typename FamilyType::WALKER_TYPE) +
-                                   KernelCommandsHelper<FamilyType>::getSizeRequiredCS();
+                                   KernelCommandsHelper<FamilyType>::getSizeRequiredCS(&kernel);

    //cs has a minimum required size
    auto sizeThatNeedsToBeSubstracted = sizeDispatchWalkerNeeds + CSRequirements::minCommandQueueCommandStreamSize;
--- a/unit_tests/command_queue/enqueue_kernel_2_tests.cpp
+++ b/unit_tests/command_queue/enqueue_kernel_2_tests.cpp
@ -10,7 +10,9 @@
 #include "runtime/memory_manager/allocations_list.h"
 #include "unit_tests/command_queue/enqueue_fixture.h"
 #include "unit_tests/fixtures/hello_world_fixture.h"
+#include "unit_tests/gen_common/gen_cmd_parse.h"
 #include "unit_tests/gen_common/gen_commands_common_validation.h"
+#include "unit_tests/helpers/debug_manager_state_restore.h"
 #include "unit_tests/helpers/hw_parse.h"
 #include "unit_tests/mocks/mock_csr.h"
 #include "unit_tests/mocks/mock_command_queue.h"
@ -843,3 +845,30 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueAuxKernelTests, givenParentKernelWhenAuxTrans
        EXPECT_EQ(1u, cmdQ.waitCalled);
    }
 }
+
+// With EnableCacheFlushAfterWalker forced to 1 and the kernel flagged as requiring a flush,
+// the command parsed right after GPGPU_WALKER must be a PIPE_CONTROL with CS stall and DC flush enabled.
+HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueKernelTest, givenCacheFlushAfterWalkerEnabledWhenAllocationRequiresCacheFlushThenFlushCommandPresentAfterWalker) {
+    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+
+    DebugManagerStateRestore dbgRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+
+    MockKernelWithInternals mockKernel(*pDevice, context);
+    CommandQueueHw<FamilyType> cmdQ(context, pDevice, nullptr);
+
+    size_t gws[3] = {1, 0, 0};
+
+    // Set the flag directly instead of attaching a real SVM allocation.
+    mockKernel.mockKernel->svmAllocationsRequireCacheFlush = true;
+
+    cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
+
+    HardwareParse hwParse;
+    hwParse.parseCommands<FamilyType>(cmdQ.getCS(0), 0);
+    auto itorCmd = find<GPGPU_WALKER *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
+    ASSERT_NE(hwParse.cmdList.end(), itorCmd);
+    ++itorCmd;
+    // Fail cleanly if the walker was the last parsed command instead of dereferencing end().
+    ASSERT_NE(hwParse.cmdList.end(), itorCmd);
+    auto pipeControl = genCmdCast<PIPE_CONTROL *>(*itorCmd);
+    ASSERT_NE(nullptr, pipeControl);
+    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
+    EXPECT_TRUE(pipeControl->getDcFlushEnable());
+}
--- a/unit_tests/command_queue/enqueue_svm_mem_copy_tests.cpp
+++ b/unit_tests/command_queue/enqueue_svm_mem_copy_tests.cpp
@ -25,9 +25,9 @@ struct EnqueueSvmMemCopyTest : public DeviceFixture,
    void SetUp() override {
        DeviceFixture::SetUp();
        CommandQueueFixture::SetUp(pDevice, 0);
-        srcSvmPtr = context->getSVMAllocsManager()->createSVMAlloc(256);
+        srcSvmPtr = context->getSVMAllocsManager()->createSVMAlloc(256, false, false);
        ASSERT_NE(nullptr, srcSvmPtr);
-        dstSvmPtr = context->getSVMAllocsManager()->createSVMAlloc(256);
+        dstSvmPtr = context->getSVMAllocsManager()->createSVMAlloc(256, false, false);
        ASSERT_NE(nullptr, dstSvmPtr);
        srcSvmAlloc = context->getSVMAllocsManager()->getSVMAlloc(srcSvmPtr);
        ASSERT_NE(nullptr, srcSvmAlloc);
--- a/unit_tests/command_queue/enqueue_svm_mem_fill_tests.cpp
+++ b/unit_tests/command_queue/enqueue_svm_mem_fill_tests.cpp
@ -27,7 +27,7 @@ struct EnqueueSvmMemFillTest : public DeviceFixture,
        CommandQueueFixture::SetUp(pDevice, 0);
        patternSize = (size_t)GetParam();
        ASSERT_TRUE((0 < patternSize) && (patternSize <= 128));
-        svmPtr = context->getSVMAllocsManager()->createSVMAlloc(256, true);
+        svmPtr = context->getSVMAllocsManager()->createSVMAlloc(256, true, false);
        ASSERT_NE(nullptr, svmPtr);
        svmAlloc = context->getSVMAllocsManager()->getSVMAlloc(svmPtr);
        ASSERT_NE(nullptr, svmAlloc);
--- a/unit_tests/command_queue/enqueue_svm_tests.cpp
+++ b/unit_tests/command_queue/enqueue_svm_tests.cpp
@ -33,7 +33,7 @@ struct EnqueueSvmTest : public DeviceFixture,
    void SetUp() override {
        DeviceFixture::SetUp();
        CommandQueueFixture::SetUp(pDevice, 0);
-        ptrSVM = context->getSVMAllocsManager()->createSVMAlloc(256);
+        ptrSVM = context->getSVMAllocsManager()->createSVMAlloc(256, false, false);
    }

    void TearDown() override {
@ -238,7 +238,7 @@ TEST_F(EnqueueSvmTest, enqueueSVMMemcpy_InvalidValueDstPtrIsNull) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableAsyncEventsHandler.set(false);
    void *pDstSVM = nullptr;
-    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256);
+    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256, false, false);
    retVal = this->pCmdQ->enqueueSVMMemcpy(
        false,   // cl_bool  blocking_copy
        pDstSVM, // void *dst_ptr
@ -269,7 +269,7 @@ TEST_F(EnqueueSvmTest, enqueueSVMMemcpy_InvalidValueSrcPtrIsNull) {

 TEST_F(EnqueueSvmTest, enqueueSVMMemcpy_Success) {
    void *pDstSVM = ptrSVM;
-    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256);
+    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256, false, false);
    retVal = this->pCmdQ->enqueueSVMMemcpy(
        false,   // cl_bool  blocking_copy
        pDstSVM, // void *dst_ptr
@ -285,7 +285,7 @@ TEST_F(EnqueueSvmTest, enqueueSVMMemcpy_Success) {

 TEST_F(EnqueueSvmTest, enqueueSVMMemcpyBlocking_Success) {
    void *pDstSVM = ptrSVM;
-    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256);
+    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256, false, false);
    retVal = this->pCmdQ->enqueueSVMMemcpy(
        true,    // cl_bool  blocking_copy
        pDstSVM, // void *dst_ptr
@ -301,7 +301,7 @@ TEST_F(EnqueueSvmTest, enqueueSVMMemcpyBlocking_Success) {

 TEST_F(EnqueueSvmTest, enqueueSVMMemcpyBlockedOnEvent_Success) {
    void *pDstSVM = ptrSVM;
-    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256);
+    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256, false, false);
    UserEvent uEvent;
    cl_event eventWaitList[] = {&uEvent};
    retVal = this->pCmdQ->enqueueSVMMemcpy(
@ -319,7 +319,7 @@ TEST_F(EnqueueSvmTest, enqueueSVMMemcpyBlockedOnEvent_Success) {

 TEST_F(EnqueueSvmTest, enqueueSVMMemcpyCoherent_Success) {
    void *pDstSVM = ptrSVM;
-    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256, true);
+    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256, true, false);
    retVal = this->pCmdQ->enqueueSVMMemcpy(
        false,   // cl_bool  blocking_copy
        pDstSVM, // void *dst_ptr
@ -335,7 +335,7 @@ TEST_F(EnqueueSvmTest, enqueueSVMMemcpyCoherent_Success) {

 TEST_F(EnqueueSvmTest, enqueueSVMMemcpyCoherentBlockedOnEvent_Success) {
    void *pDstSVM = ptrSVM;
-    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256, true);
+    void *pSrcSVM = context->getSVMAllocsManager()->createSVMAlloc(256, true, false);
    UserEvent uEvent;
    cl_event eventWaitList[] = {&uEvent};
    retVal = this->pCmdQ->enqueueSVMMemcpy(
@ -522,7 +522,7 @@ TEST_F(EnqueueSvmTest, concurentMapAccess) {

    auto allocSvm = [&](uint32_t from, uint32_t to) {
        for (uint32_t i = from; i <= to; i++) {
-            svmPtrs[i] = context->getSVMAllocsManager()->createSVMAlloc(1);
+            svmPtrs[i] = context->getSVMAllocsManager()->createSVMAlloc(1, false, false);
            auto ga = context->getSVMAllocsManager()->getSVMAlloc(svmPtrs[i]);
            EXPECT_NE(nullptr, ga);
            EXPECT_EQ(ga->getUnderlyingBuffer(), svmPtrs[i]);
--- a/unit_tests/command_queue/zero_size_enqueue_tests.cpp
+++ b/unit_tests/command_queue/zero_size_enqueue_tests.cpp
@ -757,8 +757,8 @@ HWTEST_F(ZeroSizeEnqueueHandlerTest, enqueueFillImageWhenZeroSizeEnqueueIsDetect
 HWTEST_F(ZeroSizeEnqueueHandlerTest, enqueueSVMMemcpyWhenZeroSizeEnqueueIsDetectedThenCommandMarkerShouldBeEnqueued) {
    auto mockCmdQ = std::unique_ptr<MockCommandQueueHw<FamilyType>>(new MockCommandQueueHw<FamilyType>(&context, pDevice, 0));

-    void *pSrcSVM = context.getSVMAllocsManager()->createSVMAlloc(256);
-    void *pDstSVM = context.getSVMAllocsManager()->createSVMAlloc(256);
+    void *pSrcSVM = context.getSVMAllocsManager()->createSVMAlloc(256, false, false);
+    void *pDstSVM = context.getSVMAllocsManager()->createSVMAlloc(256, false, false);
    size_t zeroSize = 0;
    mockCmdQ->enqueueSVMMemcpy(false, pSrcSVM, pDstSVM, zeroSize, 0, nullptr, nullptr);
    EXPECT_EQ(static_cast<cl_command_type>(CL_COMMAND_MARKER), mockCmdQ->lastCommandType);
@ -771,8 +771,8 @@ HWTEST_F(ZeroSizeEnqueueHandlerTest, enqueueSVMMemcpyWhenZeroSizeEnqueueIsDetect
    auto mockCmdQ = std::unique_ptr<MockCommandQueueHw<FamilyType>>(new MockCommandQueueHw<FamilyType>(&context, pDevice, 0));

    cl_event event;
-    void *pSrcSVM = context.getSVMAllocsManager()->createSVMAlloc(256);
-    void *pDstSVM = context.getSVMAllocsManager()->createSVMAlloc(256);
+    void *pSrcSVM = context.getSVMAllocsManager()->createSVMAlloc(256, false, false);
+    void *pDstSVM = context.getSVMAllocsManager()->createSVMAlloc(256, false, false);
    size_t zeroSize = 0;
    mockCmdQ->enqueueSVMMemcpy(false, pSrcSVM, pDstSVM, zeroSize, 0, nullptr, &event);
    EXPECT_EQ(static_cast<cl_command_type>(CL_COMMAND_MARKER), mockCmdQ->lastCommandType);
@ -793,7 +793,7 @@ HWTEST_F(ZeroSizeEnqueueHandlerTest, enqueueSVMMemcpyWhenZeroSizeEnqueueIsDetect
 HWTEST_F(ZeroSizeEnqueueHandlerTest, enqueueSVMMemFillWhenZeroSizeEnqueueIsDetectedThenCommandMarkerShouldBeEnqueued) {
    auto mockCmdQ = std::unique_ptr<MockCommandQueueHw<FamilyType>>(new MockCommandQueueHw<FamilyType>(&context, pDevice, 0));

-    void *pSVM = context.getSVMAllocsManager()->createSVMAlloc(256);
+    void *pSVM = context.getSVMAllocsManager()->createSVMAlloc(256, false, false);
    const float pattern[1] = {1.2345f};
    size_t zeroSize = 0;
    mockCmdQ->enqueueSVMMemFill(pSVM, &pattern, sizeof(pattern), zeroSize, 0, nullptr, nullptr);
@ -806,7 +806,7 @@ HWTEST_F(ZeroSizeEnqueueHandlerTest, enqueueSVMMemFillWhenZeroSizeEnqueueIsDetec
    auto mockCmdQ = std::unique_ptr<MockCommandQueueHw<FamilyType>>(new MockCommandQueueHw<FamilyType>(&context, pDevice, 0));

    cl_event event;
-    void *pSVM = context.getSVMAllocsManager()->createSVMAlloc(256);
+    void *pSVM = context.getSVMAllocsManager()->createSVMAlloc(256, false, false);
    const float pattern[1] = {1.2345f};
    size_t zeroSize = 0;
    mockCmdQ->enqueueSVMMemFill(pSVM, &pattern, sizeof(pattern), zeroSize, 0, nullptr, &event);
--- a/unit_tests/context/driver_diagnostics_enqueue_tests.cpp
+++ b/unit_tests/context/driver_diagnostics_enqueue_tests.cpp
@ -648,7 +648,7 @@ TEST_P(PerformanceHintEnqueueMapTest, GivenZeroCopyFlagWhenEnqueueUnmapIsCalling

 TEST_F(PerformanceHintEnqueueTest, GivenSVMPointerWhenEnqueueSVMMapIsCallingThenContextProvidesProperHint) {

-    void *svmPtr = context->getSVMAllocsManager()->createSVMAlloc(256);
+    void *svmPtr = context->getSVMAllocsManager()->createSVMAlloc(256, false, false);

    pCmdQ->enqueueSVMMap(CL_FALSE, 0, svmPtr, 256, 0, nullptr, nullptr);

--- a/unit_tests/fixtures/device_fixture.cpp
+++ b/unit_tests/fixtures/device_fixture.cpp
@ -10,7 +10,8 @@

 namespace OCLRT {
 void DeviceFixture::SetUp() {
-    SetUpImpl(nullptr);
+    hwInfoHelper = *platformDevices[0]; // fixture-owned copy of the hardware info found at platformDevices[0]
+    SetUpImpl(&hwInfoHelper); // pass the copy instead of nullptr
 }

 void DeviceFixture::SetUpImpl(const OCLRT::HardwareInfo *hardwareInfo) {
--- a/unit_tests/gen8/test_device_caps_gen8.cpp
+++ b/unit_tests/gen8/test_device_caps_gen8.cpp
@ -46,3 +46,7 @@ GEN8TEST_F(Gen8DeviceCaps, image3DDimensions) {
 GEN8TEST_F(Gen8DeviceCaps, givenHwInfoWhenSlmSizeIsRequiredThenReturnCorrectValue) {
    EXPECT_EQ(64u, pDevice->getHardwareInfo().capabilityTable.slmSize);
 }
+
+GEN8TEST_F(Gen8DeviceCaps, givenGen8WhenCheckSupportCacheFlushAfterWalkerThenFalse) {
+    EXPECT_FALSE(pDevice->getHardwareInfo().capabilityTable.supportCacheFlushAfterWalker); // Gen8 capability table must report the post-walker cache flush as unsupported
+}
--- a/unit_tests/gen9/test_device_caps_gen9.cpp
+++ b/unit_tests/gen9/test_device_caps_gen9.cpp
@ -59,3 +59,7 @@ GEN9TEST_F(Gen9DeviceCaps, givenHwInfoWhenRequestedComputeUnitsUsedForScratchThe
 GEN9TEST_F(Gen9DeviceCaps, givenHwInfoWhenSlmSizeIsRequiredThenReturnCorrectValue) {
    EXPECT_EQ(64u, pDevice->getHardwareInfo().capabilityTable.slmSize);
 }
+
+GEN9TEST_F(Gen9DeviceCaps, givenGen9WhenCheckSupportCacheFlushAfterWalkerThenFalse) {
+    EXPECT_FALSE(pDevice->getHardwareInfo().capabilityTable.supportCacheFlushAfterWalker); // Gen9 capability table must report the post-walker cache flush as unsupported
+}
--- a/unit_tests/helpers/CMakeLists.txt
+++ b/unit_tests/helpers/CMakeLists.txt
@ -30,6 +30,7 @@ set(IGDRCL_SRCS_tests_helpers
  ${CMAKE_CURRENT_SOURCE_DIR}/hw_parse.h
  ${CMAKE_CURRENT_SOURCE_DIR}/hw_parse.inl
  ${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands_tests.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands_tests.h
  ${CMAKE_CURRENT_SOURCE_DIR}/kernel_filename_helper.h
  ${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_tests.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/memory_management_tests.cpp
--- a/unit_tests/helpers/kernel_commands_tests.cpp
+++ b/unit_tests/helpers/kernel_commands_tests.cpp
@ -5,56 +5,52 @@
 *
 */

-#include "runtime/built_ins/built_ins.h"
 #include "runtime/built_ins/builtins_dispatch_builder.h"
 #include "hw_cmds.h"
 #include "runtime/command_queue/command_queue_hw.h"
 #include "runtime/helpers/basic_math.h"
 #include "runtime/helpers/kernel_commands.h"
-#include "runtime/kernel/kernel.h"
-#include "unit_tests/fixtures/context_fixture.h"
-#include "unit_tests/fixtures/device_fixture.h"
-#include "unit_tests/fixtures/image_fixture.h"
+#include "runtime/memory_manager/svm_memory_manager.h"
 #include "unit_tests/fixtures/execution_model_kernel_fixture.h"
+#include "unit_tests/fixtures/image_fixture.h"
 #include "unit_tests/helpers/debug_manager_state_restore.h"
+#include "unit_tests/helpers/hw_parse.h"
+#include "unit_tests/helpers/kernel_commands_tests.h"
 #include "unit_tests/indirect_heap/indirect_heap_fixture.h"
-#include "unit_tests/fixtures/built_in_fixture.h"
-#include "unit_tests/mocks/mock_kernel.h"
-#include "unit_tests/mocks/mock_program.h"
-#include "unit_tests/mocks/mock_context.h"
-#include "test.h"
-
-#include <memory>
+#include "unit_tests/mocks/mock_graphics_allocation.h"

 using namespace OCLRT;

-struct KernelCommandsTest : DeviceFixture,
-                            ContextFixture,
-                            BuiltInFixture,
-                            ::testing::Test {
+void KernelCommandsTest::SetUp() { // sets up device, context and built-ins in that order, then the shared mock kernel
+    DeviceFixture::SetUp();
+    ASSERT_NE(nullptr, pDevice);
+    cl_device_id device = pDevice;
+    ContextFixture::SetUp(1, &device); // context is created for that single device
+    ASSERT_NE(nullptr, pContext);
+    BuiltInFixture::SetUp(pDevice);
+    ASSERT_NE(nullptr, pBuiltIns);

-    using BuiltInFixture::SetUp;
-    using ContextFixture::SetUp;
+    mockKernelWithInternal = std::make_unique<MockKernelWithInternals>(*pDevice, pContext); // one mock kernel per test, built from the fixture's device and context
+}

-    void SetUp() override {
-        DeviceFixture::SetUp();
-        ASSERT_NE(nullptr, pDevice);
-        cl_device_id device = pDevice;
-        ContextFixture::SetUp(1, &device);
-        ASSERT_NE(nullptr, pContext);
-        BuiltInFixture::SetUp(pDevice);
-        ASSERT_NE(nullptr, pBuiltIns);
-    }
+void KernelCommandsTest::TearDown() {
+    mockKernelWithInternal.reset(nullptr); // destroy the mock kernel before the context and device it was created from
+    BuiltInFixture::TearDown(); // fixtures are torn down in the reverse of their SetUp order
+    ContextFixture::TearDown();
+    DeviceFixture::TearDown();
+}

-    void TearDown() override {
-        BuiltInFixture::TearDown();
-        ContextFixture::TearDown();
-        DeviceFixture::TearDown();
-    }
-
-    size_t sizeRequiredCS;
-    size_t sizeRequiredISH;
-};
+void KernelCommandsTest::addSpaceForSingleKernelArg() {
+    kernelArguments.assign(1, kernelArgInfo); // exactly one argument entry, copied from the fixture's kernelArgInfo
+    mockKernelWithInternal->kernelInfo.resizeKernelArgInfoAndRegisterParameter(1);
+    mockKernelWithInternal->kernelInfo.kernelArgInfo.resize(1);
+    auto &patchInfos = mockKernelWithInternal->kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector;
+    patchInfos.resize(1);
+    patchInfos[0].crossthreadOffset = 0; // single pointer-sized patch at cross-thread offset 0
+    patchInfos[0].size = sizeof(uintptr_t);
+    mockKernelWithInternal->mockKernel->setKernelArguments(kernelArguments);
+    mockKernelWithInternal->mockKernel->kernelArgRequiresCacheFlush.resize(1); // one cache-flush slot, matching the single argument
+}

 HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, programInterfaceDescriptorDataResourceUsage) {
    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);
@ -202,11 +198,10 @@ HWTEST_F(KernelCommandsTest, givenIndirectHeapNotAllocatedFromInternalPoolWhenSe
    auto nonInternalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{MemoryConstants::pageSize});
    IndirectHeap indirectHeap(nonInternalAllocation, false);

-    MockKernelWithInternals mockKernelWithInternal(*pDevice);
-    auto sizeCrossThreadData = mockKernelWithInternal.mockKernel->getCrossThreadDataSize();
+    auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
    auto offset = KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap,
-        *mockKernelWithInternal.mockKernel,
+        *mockKernelWithInternal->mockKernel,
        false,
        nullptr,
        sizeCrossThreadData);
@ -219,11 +214,10 @@ HWTEST_F(KernelCommandsTest, givenIndirectHeapAllocatedFromInternalPoolWhenSendC
    IndirectHeap indirectHeap(internalAllocation, true);
    auto expectedOffset = internalAllocation->getGpuAddressToPatch();

-    MockKernelWithInternals mockKernelWithInternal(*pDevice);
-    auto sizeCrossThreadData = mockKernelWithInternal.mockKernel->getCrossThreadDataSize();
+    auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
    auto offset = KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap,
-        *mockKernelWithInternal.mockKernel,
+        *mockKernelWithInternal->mockKernel,
        false,
        nullptr,
        sizeCrossThreadData);
@ -358,7 +352,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage)
    EXPECT_GE(sizeRequiredSSH, usedAfterSSH - usedBeforeSSH);

    auto usedAfterCS = commandStream.getUsed();
-    EXPECT_GE(KernelCommandsHelper<FamilyType>::getSizeRequiredCS(), usedAfterCS - usedBeforeCS);
+    EXPECT_GE(KernelCommandsHelper<FamilyType>::getSizeRequiredCS(kernel), usedAfterCS - usedBeforeCS);
 }

 HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableEntriesWhenIndirectStateIsEmittedThenInterfaceDescriptorContainsCorrectBindingTableEntryCount) {
@ -370,10 +364,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableE
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

-    MockKernelWithInternals mockKernel(*pDevice, pContext);
-
    auto expectedBindingTableCount = 3u;
-    mockKernel.mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
+    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
@ -386,8 +378,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableE
        dsh,
        ioh,
        ssh,
-        *mockKernel.mockKernel,
-        mockKernel.mockKernel->getKernelInfo().getMaxSimdSize(),
+        *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
        interfaceDescriptorIndex,
@ -415,11 +407,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelThatIsSchedulerWhenIn
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

-    MockKernelWithInternals mockKernel(*pDevice, pContext);
-
    auto expectedBindingTableCount = 3u;
-    mockKernel.mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
-    auto isScheduler = const_cast<bool *>(&mockKernel.mockKernel->isSchedulerKernel);
+    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
+    auto isScheduler = const_cast<bool *>(&mockKernelWithInternal->mockKernel->isSchedulerKernel);
    *isScheduler = true;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
@ -433,8 +423,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelThatIsSchedulerWhenIn
        dsh,
        ioh,
        ssh,
-        *mockKernel.mockKernel,
-        mockKernel.mockKernel->getKernelInfo().getMaxSimdSize(),
+        *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
        interfaceDescriptorIndex,
@ -458,10 +448,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWith100BindingTableEn
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

-    MockKernelWithInternals mockKernel(*pDevice, pContext);
-
    auto expectedBindingTableCount = 100u;
-    mockKernel.mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
+    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
@ -474,8 +462,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWith100BindingTableEn
        dsh,
        ioh,
        ssh,
-        *mockKernel.mockKernel,
-        mockKernel.mockKernel->getKernelInfo().getMaxSimdSize(),
+        *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
        interfaceDescriptorIndex,
@ -981,7 +969,6 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndir
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
-    MockKernelWithInternals kernelInternals(*pDevice);
    const size_t localWorkSizes[3]{1, 1, 1};

    auto &commandStream = cmdQ.getCS(1024);
@ -1007,8 +994,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndir
    memset(mockDsh, 6, borderColorSize);
    memset(mockDsh + borderColorSize, 8, borderColorSize);

-    kernelInternals.kernelInfo.heapInfo.pDsh = mockDsh;
-    kernelInternals.kernelInfo.patchInfo.samplerStateArray = &samplerStateArray;
+    mockKernelWithInternal->kernelInfo.heapInfo.pDsh = mockDsh;
+    mockKernelWithInternal->kernelInfo.patchInfo.samplerStateArray = &samplerStateArray;

    uint64_t interfaceDescriptorTableOffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
@ -1024,16 +1011,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndir
        pSamplerState[i].setIndirectStatePointer(0);
    }

-    MockKernel *kernel = new MockKernel(kernelInternals.mockProgram, kernelInternals.kernelInfo, *pDevice);
-    kernel->setCrossThreadData(kernelInternals.crossThreadData, sizeof(kernelInternals.crossThreadData));
-    kernel->setSshLocal(kernelInternals.sshLocal, sizeof(kernelInternals.sshLocal));
+    mockKernelWithInternal->mockKernel->setCrossThreadData(mockKernelWithInternal->crossThreadData, sizeof(mockKernelWithInternal->crossThreadData));
+    mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal));
    uint32_t interfaceDescriptorIndex = 0;
    KernelCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
-        *kernel,
+        *mockKernelWithInternal->mockKernel,
        8,
        localWorkSizes,
        interfaceDescriptorTableOffset,
@ -1081,7 +1067,6 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndir
        EXPECT_EQ(borderColorOffset, pSamplerStatesCopied[i].getIndirectStatePointer());
    }

-    delete kernel;
    delete[] mockDsh;
 }

@ -1207,11 +1192,10 @@ HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineThe

    uint32_t crossThreadData[8];

-    MockKernelWithInternals mockKernelWithInternal(*pDevice);
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->PassInlineData = 1;
-    mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->PassInlineData = 1;
+    mockKernelWithInternal->mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));

-    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
+    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal->mockKernel));
 }

 HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInlineThenReturnFalse) {
@ -1220,45 +1204,179 @@ HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInline

    uint32_t crossThreadData[8];

-    MockKernelWithInternals mockKernelWithInternal(*pDevice);
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->PassInlineData = 0;
-    mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->PassInlineData = 0;
+    mockKernelWithInternal->mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));

-    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
+    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal->mockKernel));
 }

 HWTEST_F(KernelCommandsTest, whenLocalIdxInXDimPresentThenExpectLocalIdsInUseIsTrue) {
-    MockKernelWithInternals mockKernelWithInternal(*pDevice);
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 1;
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0;
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0;
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 1; // only the X flag is set in the thread payload
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0;
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0;

-    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal.mockKernel));
+    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel)); // uses the fixture-owned mock kernel
 }

 HWTEST_F(KernelCommandsTest, whenLocalIdxInYDimPresentThenExpectLocalIdsInUseIsTrue) {
-    MockKernelWithInternals mockKernelWithInternal(*pDevice);
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0;
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 1;
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0;
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0;
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 1; // only the Y flag is set in the thread payload
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0;

-    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal.mockKernel));
+    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel)); // uses the fixture-owned mock kernel
 }

 HWTEST_F(KernelCommandsTest, whenLocalIdxInZDimPresentThenExpectLocalIdsInUseIsTrue) {
-    MockKernelWithInternals mockKernelWithInternal(*pDevice);
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0;
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0;
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 1;
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0;
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0;
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 1; // only the Z flag is set in the thread payload

-    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal.mockKernel));
+    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel)); // uses the fixture-owned mock kernel
 }

 HWTEST_F(KernelCommandsTest, whenLocalIdxAreNotPresentThenExpectLocalIdsInUseIsFalse) {
-    MockKernelWithInternals mockKernelWithInternal(*pDevice);
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0;
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0;
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0;
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0; // all three presence flags are cleared
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0;
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0;

-    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal.mockKernel));
+    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel)); // uses the fixture-owned mock kernel
+}
+
+HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnabledWhenProgramGlobalSurfacePresentThenExpectCacheFlushCommand) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;
+    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
+    // Checks that a program-level global surface alone makes the helper size and emit the flush.
+    DebugManagerStateRestore dbgRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+
+    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
+    auto &commandStream = cmdQ.getCS(1024);
+
+    MockGraphicsAllocation globalAllocation;
+    mockKernelWithInternal->mockProgram->setGlobalSurface(&globalAllocation);
+    // Expected size includes one PIPE_CONTROL for the flush.
+    size_t expectedSize = 2 * sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + sizeof(PIPE_CONTROL);
+    size_t actualSize = KernelCommandsHelper<FamilyType>::getSizeRequiredCS(mockKernelWithInternal->mockKernel);
+    EXPECT_EQ(expectedSize, actualSize);
+
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, mockKernelWithInternal->mockKernel);
+    // Detach the stack allocation now: a failing ASSERT_* below returns early and must not leave the program pointing at it.
+    mockKernelWithInternal->mockProgram->setGlobalSurface(nullptr);
+
+    HardwareParse hwParse;
+    hwParse.parseCommands<FamilyType>(commandStream);
+    PIPE_CONTROL *pipeControl = hwParse.getCommand<PIPE_CONTROL>();
+    ASSERT_NE(nullptr, pipeControl);
+    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
+    EXPECT_TRUE(pipeControl->getDcFlushEnable());
+}
+
+HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnabledWhenSvmAllocationsSetAsCacheFlushRequiringThenExpectCacheFlushCommand) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;
+    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
+    // Feature flag is forced on for this test.
+    DebugManagerStateRestore dbgRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+
+    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
+    auto &stream = cmdQ.getCS(1024);
+    auto *kernel = mockKernelWithInternal->mockKernel;
+    // Only the kernel's SVM marker is raised; no global surface and no kernel argument is involved.
+    kernel->svmAllocationsRequireCacheFlush = true;
+    // Expected size includes one PIPE_CONTROL for the flush.
+    const size_t sizeWithFlush = 2 * sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + sizeof(PIPE_CONTROL);
+    EXPECT_EQ(sizeWithFlush, KernelCommandsHelper<FamilyType>::getSizeRequiredCS(kernel));
+
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&stream, kernel);
+
+    HardwareParse parser;
+    parser.parseCommands<FamilyType>(stream);
+    PIPE_CONTROL *flushCmd = parser.getCommand<PIPE_CONTROL>();
+    ASSERT_NE(nullptr, flushCmd);
+    EXPECT_TRUE(flushCmd->getCommandStreamerStallEnable());
+    EXPECT_TRUE(flushCmd->getDcFlushEnable());
+}
+
+HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnabledWhenKernelArgIsSetAsCacheFlushRequiredThenExpectCacheFlushCommand) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;
+    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
+    // Feature flag is forced on for this test.
+    DebugManagerStateRestore dbgRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+
+    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
+    auto &stream = cmdQ.getCS(1024);
+    auto *kernel = mockKernelWithInternal->mockKernel;
+    // The single kernel argument is the only thing recorded as needing a flush.
+    addSpaceForSingleKernelArg();
+    MockGraphicsAllocation flushedAllocation;
+    kernel->kernelArgRequiresCacheFlush[0] = &flushedAllocation;
+
+    const size_t sizeWithFlush = 2 * sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + sizeof(PIPE_CONTROL);
+    EXPECT_EQ(sizeWithFlush, KernelCommandsHelper<FamilyType>::getSizeRequiredCS(kernel));
+
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&stream, kernel);
+
+    HardwareParse parser;
+    parser.parseCommands<FamilyType>(stream);
+    PIPE_CONTROL *flushCmd = parser.getCommand<PIPE_CONTROL>();
+    ASSERT_NE(nullptr, flushCmd);
+    EXPECT_TRUE(flushCmd->getCommandStreamerStallEnable());
+    EXPECT_TRUE(flushCmd->getDcFlushEnable());
+}
+
+HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnabledWhenNoGlobalSurfaceSvmAllocationKernelArgRequireCacheFlushThenExpectNoCacheFlushCommand) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;
+    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
+    // Feature flag is forced on, but nothing below marks the kernel as needing a flush.
+    DebugManagerStateRestore dbgRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+
+    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
+    auto &stream = cmdQ.getCS(1024);
+    auto *kernel = mockKernelWithInternal->mockKernel;
+    // One argument slot exists, but its cache-flush entry is left unset.
+    addSpaceForSingleKernelArg();
+
+    const size_t sizeWithoutFlush = 2 * sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD);
+    EXPECT_EQ(sizeWithoutFlush, KernelCommandsHelper<FamilyType>::getSizeRequiredCS(kernel));
+
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&stream, kernel);
+
+    HardwareParse parser;
+    parser.parseCommands<FamilyType>(stream);
+    // No PIPE_CONTROL may have been written to the stream.
+    EXPECT_EQ(nullptr, parser.getCommand<PIPE_CONTROL>());
+}
+
+HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenCacheFlushAfterWalkerEnabledWhenPlatformNotSupportFlushThenExpectNoCacheFlushCommand) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;
+    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
+    // NOTE(review): -1 presumably means "follow the platform capability", despite "Enabled" in the test name - confirm.
+    DebugManagerStateRestore dbgRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(-1);
+    hwInfoHelper.capabilityTable.supportCacheFlushAfterWalker = false;
+
+    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
+    auto &stream = cmdQ.getCS(1024);
+    auto *kernel = mockKernelWithInternal->mockKernel;
+    // The argument is recorded as needing a flush, yet the capability table says the platform cannot do it.
+    addSpaceForSingleKernelArg();
+    MockGraphicsAllocation flushedAllocation;
+    kernel->kernelArgRequiresCacheFlush[0] = &flushedAllocation;
+
+    const size_t sizeWithoutFlush = 2 * sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD);
+    EXPECT_EQ(sizeWithoutFlush, KernelCommandsHelper<FamilyType>::getSizeRequiredCS(kernel));
+
+    KernelCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&stream, kernel);
+
+    HardwareParse parser;
+    parser.parseCommands<FamilyType>(stream);
+    // No PIPE_CONTROL may have been written to the stream.
+    EXPECT_EQ(nullptr, parser.getCommand<PIPE_CONTROL>());
 }
--- a/unit_tests/helpers/kernel_commands_tests.h
+++ b/unit_tests/helpers/kernel_commands_tests.h
@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+
+#include "runtime/built_ins/built_ins.h"
+#include "runtime/kernel/kernel.h"
+#include "unit_tests/fixtures/context_fixture.h"
+#include "unit_tests/fixtures/device_fixture.h"
+#include "unit_tests/fixtures/built_in_fixture.h"
+#include "unit_tests/mocks/mock_context.h"
+#include "unit_tests/mocks/mock_graphics_allocation.h"
+#include "unit_tests/mocks/mock_kernel.h"
+#include "unit_tests/mocks/mock_program.h"
+#include "test.h"
+
+#include <memory>
+#include <vector>
+
+// NOTE(review): a file-scope using-directive in a header leaks OCLRT into every includer; kept because includers may rely on it.
+using namespace OCLRT;
+
+// Fixture for KernelCommandsHelper tests, built on the device, context and built-in fixtures.
+// SetUp(), TearDown() and addSpaceForSingleKernelArg() are defined in kernel_commands_tests.cpp.
+struct KernelCommandsTest : DeviceFixture,
+                            ContextFixture,
+                            BuiltInFixture,
+                            ::testing::Test {
+
+    using BuiltInFixture::SetUp;
+    using ContextFixture::SetUp;
+
+    void SetUp() override;
+    void TearDown() override;
+
+    // Gives the mock kernel exactly one pointer-sized argument and one cache-flush slot.
+    void addSpaceForSingleKernelArg();
+
+    size_t sizeRequiredCS = 0;
+    size_t sizeRequiredISH = 0;
+
+    std::unique_ptr<MockKernelWithInternals> mockKernelWithInternal;
+    Kernel::SimpleKernelArgInfo kernelArgInfo = {};
+    std::vector<Kernel::SimpleKernelArgInfo> kernelArguments;
+};
--- a/unit_tests/kernel/clone_kernel_tests.cpp
+++ b/unit_tests/kernel/clone_kernel_tests.cpp
@ -508,7 +508,7 @@ TEST_F(CloneKernelTest, cloneKernelWithArgImmediate) {
 }

 TEST_F(CloneKernelTest, cloneKernelWithExecInfo) {
-    void *ptrSVM = pContext->getSVMAllocsManager()->createSVMAlloc(256);
+    void *ptrSVM = pContext->getSVMAllocsManager()->createSVMAlloc(256, false, false);
    ASSERT_NE(nullptr, ptrSVM);

    GraphicsAllocation *pSvmAlloc = pContext->getSVMAllocsManager()->getSVMAlloc(ptrSVM);
--- a/unit_tests/kernel/kernel_arg_buffer_tests.cpp
+++ b/unit_tests/kernel/kernel_arg_buffer_tests.cpp
@ -167,3 +167,42 @@ TEST_F(KernelArgBufferTest, given32BitDeviceWhenArgPassedIsNullThenOnly4BytesAre
    EXPECT_EQ(0u, *pKernelArg32bit);
    EXPECT_NE(expValue, *pKernelArg64bit);
 }
+
+TEST_F(KernelArgBufferTest, givenWritebleBufferWhenSettingAsArgThenExpectAllocationInCacheFlushVector) {
+    // Writable allocation without an explicit L3 flush request.
+    auto mockBuffer = std::make_unique<MockBuffer>();
+    auto &allocation = mockBuffer->mockGfxAllocation;
+    allocation.setMemObjectsAllocationWithWritableFlags(true);
+    allocation.flushL3Required = false;
+
+    auto memObj = static_cast<cl_mem>(mockBuffer.get());
+    const auto status = pKernel->setArg(0, sizeof(cl_mem *), &memObj);
+    EXPECT_EQ(CL_SUCCESS, status);
+    EXPECT_EQ(&allocation, pKernel->kernelArgRequiresCacheFlush[0]);
+}
+
+TEST_F(KernelArgBufferTest, givenCacheFlushBufferWhenSettingAsArgThenExpectAllocationInCacheFlushVector) {
+    // Read-only allocation that explicitly requests an L3 flush.
+    auto mockBuffer = std::make_unique<MockBuffer>();
+    auto &allocation = mockBuffer->mockGfxAllocation;
+    allocation.setMemObjectsAllocationWithWritableFlags(false);
+    allocation.flushL3Required = true;
+
+    auto memObj = static_cast<cl_mem>(mockBuffer.get());
+    const auto status = pKernel->setArg(0, sizeof(cl_mem *), &memObj);
+    EXPECT_EQ(CL_SUCCESS, status);
+    EXPECT_EQ(&allocation, pKernel->kernelArgRequiresCacheFlush[0]);
+}
+
+TEST_F(KernelArgBufferTest, givenNoCacheFlushBufferWhenSettingAsArgThenNotExpectAllocationInCacheFlushVector) {
+    // Read-only allocation with no L3 flush request: nothing should be tracked for this argument.
+    auto mockBuffer = std::make_unique<MockBuffer>();
+    auto &allocation = mockBuffer->mockGfxAllocation;
+    allocation.setMemObjectsAllocationWithWritableFlags(false);
+    allocation.flushL3Required = false;
+
+    auto memObj = static_cast<cl_mem>(mockBuffer.get());
+    const auto status = pKernel->setArg(0, sizeof(cl_mem *), &memObj);
+    EXPECT_EQ(CL_SUCCESS, status);
+    EXPECT_EQ(nullptr, pKernel->kernelArgRequiresCacheFlush[0]);
+}
--- a/unit_tests/kernel/kernel_arg_svm_tests.cpp
+++ b/unit_tests/kernel/kernel_arg_svm_tests.cpp
@ -412,3 +412,90 @@ HWTEST_TYPED_TEST(KernelArgSvmTestTyped, GivenBufferKernelArgWhenBufferOffsetIsN

    alignedFree(svmPtr);
 }
+
+TEST_F(KernelArgSvmTest, givenWritebleSvmAllocationWhenSettingAsArgThenExpectAllocationInCacheFlushVector) {
+    const size_t allocSize = 4096;
+    void *hostPtr = alignedMalloc(allocSize, MemoryConstants::pageSize);
+    MockGraphicsAllocation graphicsAllocation(hostPtr, allocSize);
+    // Writable allocation without an explicit L3 flush request.
+    graphicsAllocation.setMemObjectsAllocationWithWritableFlags(true);
+    graphicsAllocation.flushL3Required = false;
+
+    const auto status = pKernel->setArgSvmAlloc(0, hostPtr, &graphicsAllocation);
+    EXPECT_EQ(CL_SUCCESS, status);
+    EXPECT_EQ(&graphicsAllocation, pKernel->kernelArgRequiresCacheFlush[0]);
+
+    alignedFree(hostPtr);
+}
+
+TEST_F(KernelArgSvmTest, givenCacheFlushSvmAllocationWhenSettingAsArgThenExpectAllocationInCacheFlushVector) {
+    const size_t allocSize = 4096;
+    void *hostPtr = alignedMalloc(allocSize, MemoryConstants::pageSize);
+    MockGraphicsAllocation graphicsAllocation(hostPtr, allocSize);
+    // Read-only allocation that explicitly requests an L3 flush.
+    graphicsAllocation.setMemObjectsAllocationWithWritableFlags(false);
+    graphicsAllocation.flushL3Required = true;
+
+    const auto status = pKernel->setArgSvmAlloc(0, hostPtr, &graphicsAllocation);
+    EXPECT_EQ(CL_SUCCESS, status);
+    EXPECT_EQ(&graphicsAllocation, pKernel->kernelArgRequiresCacheFlush[0]);
+
+    alignedFree(hostPtr);
+}
+
+TEST_F(KernelArgSvmTest, givenNoCacheFlushSvmAllocationWhenSettingAsArgThenNotExpectAllocationInCacheFlushVector) {
+    const size_t allocSize = 4096;
+    void *hostPtr = alignedMalloc(allocSize, MemoryConstants::pageSize);
+    MockGraphicsAllocation graphicsAllocation(hostPtr, allocSize);
+    // Read-only allocation with no L3 flush request: nothing should be tracked for this argument.
+    graphicsAllocation.setMemObjectsAllocationWithWritableFlags(false);
+    graphicsAllocation.flushL3Required = false;
+
+    const auto status = pKernel->setArgSvmAlloc(0, hostPtr, &graphicsAllocation);
+    EXPECT_EQ(CL_SUCCESS, status);
+    EXPECT_EQ(nullptr, pKernel->kernelArgRequiresCacheFlush[0]);
+
+    alignedFree(hostPtr);
+}
+
+TEST_F(KernelArgSvmTest, givenWritableSvmAllocationWhenSettingKernelExecInfoThenExpectSvmFlushFlagTrue) {
+    const size_t allocSize = 4096;
+    void *hostPtr = alignedMalloc(allocSize, MemoryConstants::pageSize);
+    MockGraphicsAllocation graphicsAllocation(hostPtr, allocSize);
+    // Writable allocation without an explicit L3 flush request.
+    graphicsAllocation.setMemObjectsAllocationWithWritableFlags(true);
+    graphicsAllocation.flushL3Required = false;
+
+    pKernel->setKernelExecInfo(&graphicsAllocation);
+    EXPECT_TRUE(pKernel->svmAllocationsRequireCacheFlush);
+
+    alignedFree(hostPtr);
+}
+
+TEST_F(KernelArgSvmTest, givenCacheFlushSvmAllocationWhenSettingKernelExecInfoThenExpectSvmFlushFlagTrue) {
+    const size_t allocSize = 4096;
+    void *hostPtr = alignedMalloc(allocSize, MemoryConstants::pageSize);
+    MockGraphicsAllocation graphicsAllocation(hostPtr, allocSize);
+    // Read-only allocation that explicitly requests an L3 flush.
+    graphicsAllocation.setMemObjectsAllocationWithWritableFlags(false);
+    graphicsAllocation.flushL3Required = true;
+
+    pKernel->setKernelExecInfo(&graphicsAllocation);
+    EXPECT_TRUE(pKernel->svmAllocationsRequireCacheFlush);
+
+    alignedFree(hostPtr);
+}
+
+TEST_F(KernelArgSvmTest, givenNoCacheFlushReadOnlySvmAllocationWhenSettingKernelExecInfoThenExpectSvmFlushFlagFalse) {
+    const size_t allocSize = 4096;
+    void *hostPtr = alignedMalloc(allocSize, MemoryConstants::pageSize);
+    MockGraphicsAllocation graphicsAllocation(hostPtr, allocSize);
+    // Read-only allocation with no L3 flush request: the kernel-wide SVM flush flag should stay clear.
+    graphicsAllocation.setMemObjectsAllocationWithWritableFlags(false);
+    graphicsAllocation.flushL3Required = false;
+
+    pKernel->setKernelExecInfo(&graphicsAllocation);
+    EXPECT_FALSE(pKernel->svmAllocationsRequireCacheFlush);
+
+    alignedFree(hostPtr);
+}
--- a/unit_tests/kernel/kernel_image_arg_tests.cpp
+++ b/unit_tests/kernel/kernel_image_arg_tests.cpp
@ -259,3 +259,39 @@ TEST_F(KernelImageArgTest, givenKernelWithSharedImageWhenSetArgCalledThenUsingSh
    EXPECT_TRUE(pKernel->getKernelArguments()[0].isPatched);
    EXPECT_TRUE(pKernel->isUsingSharedObjArgs());
 }
+
+TEST_F(KernelImageArgTest, givenWritebleImageWhenSettingAsArgThenExpectAllocationInCacheFlushVector) {
+    MockImageBase image;
+    image.graphicsAllocation->setMemObjectsAllocationWithWritableFlags(true);
+    image.graphicsAllocation->flushL3Required = false;
+
+    cl_mem imageObj = &image;
+    // Capture the status so the CL_SUCCESS check below reflects this call.
+    retVal = pKernel->setArg(0, sizeof(imageObj), &imageObj);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(image.graphicsAllocation, pKernel->kernelArgRequiresCacheFlush[0]);
+}
+
+TEST_F(KernelImageArgTest, givenCacheFlushImageWhenSettingAsArgThenExpectAllocationInCacheFlushVector) {
+    MockImageBase image;
+    image.graphicsAllocation->setMemObjectsAllocationWithWritableFlags(false);
+    image.graphicsAllocation->flushL3Required = true;
+
+    cl_mem imageObj = &image;
+    // Capture the status so the CL_SUCCESS check below reflects this call.
+    retVal = pKernel->setArg(0, sizeof(imageObj), &imageObj);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(image.graphicsAllocation, pKernel->kernelArgRequiresCacheFlush[0]);
+}
+
+TEST_F(KernelImageArgTest, givenNoCacheFlushImageWhenSettingAsArgThenNotExpectAllocationInCacheFlushVector) {
+    MockImageBase image;
+    image.graphicsAllocation->setMemObjectsAllocationWithWritableFlags(false);
+    image.graphicsAllocation->flushL3Required = false;
+
+    cl_mem imageObj = &image;
+    // Capture the status so the CL_SUCCESS check below reflects this call.
+    retVal = pKernel->setArg(0, sizeof(imageObj), &imageObj);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(nullptr, pKernel->kernelArgRequiresCacheFlush[0]);
+}
--- a/unit_tests/kernel/kernel_tests.cpp
+++ b/unit_tests/kernel/kernel_tests.cpp
@ -2355,3 +2355,101 @@ TEST(KernelTest, givenDebugVariableSetWhenKernelHasStatefulBufferAccessThenMarkK
    kernel.mockKernel->initialize();
    EXPECT_TRUE(kernel.mockKernel->isAuxTranslationRequired());
 }
+
+TEST(KernelTest, whenNullAllocationThenAssignNullPointerToCacheFlushVector) {
+    std::unique_ptr<MockDevice> mockDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    MockKernelWithInternals kernelWithInternals(*mockDevice);
+    auto &cacheFlushVector = kernelWithInternals.mockKernel->kernelArgRequiresCacheFlush;
+    cacheFlushVector.resize(1);
+    cacheFlushVector[0] = reinterpret_cast<GraphicsAllocation *>(0x1); // non-null sentinel that the call must clear
+    kernelWithInternals.mockKernel->addAllocationToCacheFlushVector(0, nullptr);
+    EXPECT_EQ(nullptr, cacheFlushVector[0]);
+}
+
+TEST(KernelTest, whenAllocationRequiringCacheFlushThenAssignAllocationPointerToCacheFlushVector) {
+    // Read-only allocation that explicitly requests an L3 flush.
+    MockGraphicsAllocation graphicsAllocation;
+    graphicsAllocation.setMemObjectsAllocationWithWritableFlags(false);
+    graphicsAllocation.flushL3Required = true;
+    std::unique_ptr<MockDevice> mockDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    MockKernelWithInternals kernelWithInternals(*mockDevice);
+    auto &cacheFlushVector = kernelWithInternals.mockKernel->kernelArgRequiresCacheFlush;
+    cacheFlushVector.resize(1);
+    kernelWithInternals.mockKernel->addAllocationToCacheFlushVector(0, &graphicsAllocation);
+    EXPECT_EQ(&graphicsAllocation, cacheFlushVector[0]);
+}
+
+TEST(KernelTest, whenAllocationWriteableThenAssignAllocationPointerToCacheFlushVector) {
+    // Writable allocation without an explicit L3 flush request.
+    MockGraphicsAllocation graphicsAllocation;
+    graphicsAllocation.setMemObjectsAllocationWithWritableFlags(true);
+    graphicsAllocation.flushL3Required = false;
+    std::unique_ptr<MockDevice> mockDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    MockKernelWithInternals kernelWithInternals(*mockDevice);
+    auto &cacheFlushVector = kernelWithInternals.mockKernel->kernelArgRequiresCacheFlush;
+    cacheFlushVector.resize(1);
+    kernelWithInternals.mockKernel->addAllocationToCacheFlushVector(0, &graphicsAllocation);
+    EXPECT_EQ(&graphicsAllocation, cacheFlushVector[0]);
+}
+
+TEST(KernelTest, whenAllocationReadOnlyNonFlushRequiredThenAssignNullPointerToCacheFlushVector) {
+    // Read-only allocation with no L3 flush request.
+    MockGraphicsAllocation graphicsAllocation;
+    graphicsAllocation.setMemObjectsAllocationWithWritableFlags(false);
+    graphicsAllocation.flushL3Required = false;
+    std::unique_ptr<MockDevice> mockDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    MockKernelWithInternals kernelWithInternals(*mockDevice);
+    auto &cacheFlushVector = kernelWithInternals.mockKernel->kernelArgRequiresCacheFlush;
+    cacheFlushVector.resize(1);
+    cacheFlushVector[0] = reinterpret_cast<GraphicsAllocation *>(0x1); // non-null sentinel that the call must clear
+    kernelWithInternals.mockKernel->addAllocationToCacheFlushVector(0, &graphicsAllocation);
+    EXPECT_EQ(nullptr, cacheFlushVector[0]);
+}
+
+TEST(KernelTest, givenEnableCacheFlushFlagIsEnableWhenPlatformDoesNotSupportThenOverrideAndReturnSupportTrue) {
+    // Debug flag = 1 while the capability table reports no support: the flag is expected to win.
+    DebugManagerStateRestore stateRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
+
+    HardwareInfo hwInfo = *platformDevices[0];
+    hwInfo.capabilityTable.supportCacheFlushAfterWalker = false;
+    std::unique_ptr<MockDevice> mockDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo));
+    MockKernelWithInternals kernelWithInternals(*mockDevice);
+    EXPECT_TRUE(kernelWithInternals.mockKernel->platformSupportCacheFlushAfterWalker());
+}
+
+TEST(KernelTest, givenEnableCacheFlushFlagIsDisableWhenPlatformSupportsThenOverrideAndReturnSupportFalse) {
+    // Debug flag = 0 while the capability table reports support: the flag is expected to win.
+    DebugManagerStateRestore stateRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(0);
+
+    HardwareInfo hwInfo = *platformDevices[0];
+    hwInfo.capabilityTable.supportCacheFlushAfterWalker = true;
+    std::unique_ptr<MockDevice> mockDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo));
+    MockKernelWithInternals kernelWithInternals(*mockDevice);
+    EXPECT_FALSE(kernelWithInternals.mockKernel->platformSupportCacheFlushAfterWalker());
+}
+
+TEST(KernelTest, givenEnableCacheFlushFlagIsReadPlatformSettingWhenPlatformDoesNotSupportThenReturnSupportFalse) {
+    // Debug flag = -1: the capability table value (no support) is expected to be reported unchanged.
+    DebugManagerStateRestore stateRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(-1);
+
+    HardwareInfo hwInfo = *platformDevices[0];
+    hwInfo.capabilityTable.supportCacheFlushAfterWalker = false;
+    std::unique_ptr<MockDevice> mockDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo));
+    MockKernelWithInternals kernelWithInternals(*mockDevice);
+    EXPECT_FALSE(kernelWithInternals.mockKernel->platformSupportCacheFlushAfterWalker());
+}
+
+TEST(KernelTest, givenEnableCacheFlushFlagIsReadPlatformSettingWhenPlatformSupportsThenReturnSupportTrue) {
+    // Debug flag = -1: the capability table value (support) is expected to be reported unchanged.
+    DebugManagerStateRestore stateRestore;
+    DebugManager.flags.EnableCacheFlushAfterWalker.set(-1);
+
+    HardwareInfo hwInfo = *platformDevices[0];
+    hwInfo.capabilityTable.supportCacheFlushAfterWalker = true;
+    std::unique_ptr<MockDevice> mockDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo));
+    MockKernelWithInternals kernelWithInternals(*mockDevice);
+    EXPECT_TRUE(kernelWithInternals.mockKernel->platformSupportCacheFlushAfterWalker());
+}
--- a/unit_tests/mem_obj/buffer_set_arg_tests.cpp
+++ b/unit_tests/mem_obj/buffer_set_arg_tests.cpp
@ -260,7 +260,7 @@ TEST_F(BufferSetArgTest, clSetKernelArgBuffer) {
 }

 TEST_F(BufferSetArgTest, clSetKernelArgSVMPointer) {
-    void *ptrSVM = pContext->getSVMAllocsManager()->createSVMAlloc(256);
+    void *ptrSVM = pContext->getSVMAllocsManager()->createSVMAlloc(256, false, false);
    EXPECT_NE(nullptr, ptrSVM);

    GraphicsAllocation *pSvmAlloc = pContext->getSVMAllocsManager()->getSVMAlloc(ptrSVM);
--- a/unit_tests/mem_obj/buffer_tests.cpp
+++ b/unit_tests/mem_obj/buffer_tests.cpp
@ -494,7 +494,7 @@ TEST_F(RenderCompressedBuffersTests, givenDebugVariableSetWhenHwFlagIsNotSetThen
 TEST_F(RenderCompressedBuffersTests, givenSvmAllocationWhenCreatingBufferThenForceDisableCompression) {
    localHwInfo.capabilityTable.ftrRenderCompressedBuffers = true;

-    auto svmAlloc = context->getSVMAllocsManager()->createSVMAlloc(sizeof(uint32_t), false);
+    auto svmAlloc = context->getSVMAllocsManager()->createSVMAlloc(sizeof(uint32_t), false, false);

    buffer.reset(Buffer::create(context.get(), CL_MEM_USE_HOST_PTR, sizeof(uint32_t), svmAlloc, retVal));
    EXPECT_EQ(buffer->getGraphicsAllocation()->getAllocationType(), GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY);
@ -878,7 +878,7 @@ TEST_P(ValidHostPtr, failedAllocationInjection) {
 TEST_P(ValidHostPtr, SvmHostPtr) {
    const DeviceInfo &devInfo = pDevice->getDeviceInfo();
    if (devInfo.svmCapabilities != 0) {
-        auto ptr = context->getSVMAllocsManager()->createSVMAlloc(64, false);
+        auto ptr = context->getSVMAllocsManager()->createSVMAlloc(64, false, false);

        auto bufferSvm = Buffer::create(context.get(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, 64, ptr, retVal);
        EXPECT_NE(nullptr, bufferSvm);
--- a/unit_tests/memory_manager/svm_memory_manager.cpp
+++ b/unit_tests/memory_manager/svm_memory_manager.cpp
@ -31,7 +31,7 @@ TEST_F(SVMMemoryAllocatorTest, SVMAllocCreateNullFreeNull) {
    OsAgnosticMemoryManager memoryManager(false, false, executionEnvironment);
    {
        SVMAllocsManager svmM(&memoryManager);
-        char *Ptr1 = (char *)svmM.createSVMAlloc(0);
+        char *Ptr1 = (char *)svmM.createSVMAlloc(0, false, false);
        EXPECT_EQ(Ptr1, nullptr);
        svmM.freeSVMAlloc(nullptr);
    }
@ -42,7 +42,7 @@ TEST_F(SVMMemoryAllocatorTest, SVMAllocCreateFree) {
    OsAgnosticMemoryManager memoryManager(false, false, executionEnvironment);
    {
        SVMAllocsManager svmM(&memoryManager);
-        char *Ptr1 = (char *)svmM.createSVMAlloc(4096);
+        char *Ptr1 = (char *)svmM.createSVMAlloc(4096, false, false);
        EXPECT_NE(Ptr1, nullptr);

        svmM.freeSVMAlloc(Ptr1);
@ -72,7 +72,7 @@ TEST_F(SVMMemoryAllocatorTest, SVMAllocGetBeforeAndInside) {
    OsAgnosticMemoryManager memoryManager(false, false, executionEnvironment);
    {
        SVMAllocsManager svmM(&memoryManager);
-        char *Ptr1 = (char *)svmM.createSVMAlloc(4096);
+        char *Ptr1 = (char *)svmM.createSVMAlloc(4096, false, false);
        EXPECT_NE(Ptr1, nullptr);

        char *Ptr2 = Ptr1 - 4;
@ -93,7 +93,7 @@ TEST_F(SVMMemoryAllocatorTest, SVMAllocgetAfterSVM) {
    OsAgnosticMemoryManager memoryManager(false, false, executionEnvironment);
    {
        SVMAllocsManager svmM(&memoryManager);
-        char *Ptr1 = (char *)svmM.createSVMAlloc(4096);
+        char *Ptr1 = (char *)svmM.createSVMAlloc(4096, false, false);
        EXPECT_NE(Ptr1, nullptr);

        char *Ptr2 = Ptr1 + 4096 + 100;
@ -129,7 +129,7 @@ TEST_F(SVMMemoryAllocatorTest, WhenCouldNotAllocateInMemoryManagerThenReturnsNul
    MockMemManager memoryManager(executionEnvironment);
    {
        MockSVMAllocsManager svmM{&memoryManager};
-        void *svmPtr = svmM.createSVMAlloc(512);
+        void *svmPtr = svmM.createSVMAlloc(512, false, false);
        EXPECT_EQ(nullptr, svmPtr);

        EXPECT_EQ(0U, svmM.GetSVMAllocs().getNumAllocs());
@ -151,3 +151,28 @@ TEST_F(SVMMemoryAllocatorTest, given64kbAllowedwhenAllocatingSvmMemoryThenDontPr
    myMemoryManager.allocateGraphicsMemoryForSVM(1, false);
    EXPECT_FALSE(myMemoryManager.preferRenderCompressedFlag);
 }
+
+TEST_F(SVMMemoryAllocatorTest, whenReadOnlyFlagIsPresentThenReturnTrue) {
+    EXPECT_TRUE(SVMAllocsManager::memFlagIsReadOnly(CL_MEM_READ_ONLY));
+    EXPECT_TRUE(SVMAllocsManager::memFlagIsReadOnly(CL_MEM_HOST_READ_ONLY));
+    EXPECT_TRUE(SVMAllocsManager::memFlagIsReadOnly(CL_MEM_READ_ONLY)); // NOTE(review): repeats the first check; presumably another flag (e.g. CL_MEM_HOST_NO_ACCESS) was intended - confirm against memFlagIsReadOnly
+}
+
+TEST_F(SVMMemoryAllocatorTest, whenNoReadOnlyFlagIsPresentThenReturnFalse) {
+    EXPECT_FALSE(SVMAllocsManager::memFlagIsReadOnly(CL_MEM_READ_WRITE)); // kernels may read and write
+    EXPECT_FALSE(SVMAllocsManager::memFlagIsReadOnly(CL_MEM_WRITE_ONLY)); // kernels may write
+}
+
+TEST_F(SVMMemoryAllocatorTest, whenReadOnlySvmAllocationCreatedThenGraphicsAllocationHasWriteableFlagFalse) {
+    ExecutionEnvironment executionEnvironment;
+    OsAgnosticMemoryManager memoryManager(false, false, executionEnvironment);
+    SVMAllocsManager svmM(&memoryManager);
+    void *svm = svmM.createSVMAlloc(4096, false, true); // third argument requests a read-only allocation
+    ASSERT_NE(nullptr, svm);
+
+    GraphicsAllocation *svmAllocation = svmM.getSVMAlloc(svm);
+    ASSERT_NE(nullptr, svmAllocation); // fatal on failure: the pointer is dereferenced on the next line
+    EXPECT_FALSE(svmAllocation->isMemObjectsAllocationWithWritableFlags());
+
+    svmM.freeSVMAlloc(svm);
+}
--- a/unit_tests/mocks/mock_kernel.h
+++ b/unit_tests/mocks/mock_kernel.h
@ -23,10 +23,14 @@ namespace OCLRT {
 ////////////////////////////////////////////////////////////////////////////////
 class MockKernel : public Kernel {
  public:
+    using Kernel::addAllocationToCacheFlushVector;
    using Kernel::auxTranslationRequired;
    using Kernel::isSchedulerKernel;
+    using Kernel::kernelArgRequiresCacheFlush;
    using Kernel::kernelArguments;
    using Kernel::numberOfBindingTableStates;
+    using Kernel::platformSupportCacheFlushAfterWalker;
+    using Kernel::svmAllocationsRequireCacheFlush;

    struct BlockPatchValues {
        uint64_t offset;
@ -256,6 +260,7 @@ class MockKernelWithInternals {
        threadPayload.LocalIDZPresent = 1;
        kernelInfo.heapInfo.pKernelHeap = kernelIsa;
        kernelInfo.heapInfo.pSsh = sshLocal;
+        kernelInfo.heapInfo.pDsh = dshLocal;
        kernelInfo.heapInfo.pKernelHeader = &kernelHeader;
        kernelInfo.patchInfo.dataParameterStream = &dataParameterStream;
        kernelInfo.patchInfo.executionEnvironment = &executionEnvironment;
@ -298,6 +303,7 @@ class MockKernelWithInternals {
    uint32_t kernelIsa[32];
    char crossThreadData[256];
    char sshLocal[128];
+    char dshLocal[128];
 };

 class MockParentKernel : public Kernel {
--- a/unit_tests/profiling/profiling_tests.cpp
+++ b/unit_tests/profiling/profiling_tests.cpp
@ -63,10 +63,10 @@ struct ProfilingTests : public CommandEnqueueFixture,

    std::unique_ptr<MockProgram> program;

-    SKernelBinaryHeaderCommon kernelHeader;
-    SPatchDataParameterStream dataParameterStream;
+    SKernelBinaryHeaderCommon kernelHeader = {};
+    SPatchDataParameterStream dataParameterStream = {};
    SPatchExecutionEnvironment executionEnvironment = {};
-    SPatchThreadPayload threadPayload;
+    SPatchThreadPayload threadPayload = {};
    KernelInfo kernelInfo;

    uint32_t kernelIsa[32];
@ -78,15 +78,17 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfilingAndFor
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

-    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(MI_STORE_REGISTER_MEM) + sizeof(GPGPU_WALKER) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS();
+    MockKernel kernel(program.get(), kernelInfo, *pDevice);

-    auto &commandStreamNDRangeKernel = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, true, false, nullptr);
-    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, nullptr);
+    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(MI_STORE_REGISTER_MEM) + sizeof(GPGPU_WALKER) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS(&kernel);
+
+    auto &commandStreamNDRangeKernel = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, true, false, &kernel);
+    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, &kernel);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamNDRangeKernel.getAvailableSpace(), requiredSize);

-    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, false, nullptr);
-    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, nullptr);
+    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, false, &kernel);
+    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, &kernel);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
 }
@ -114,16 +116,17 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GIVENCommandQueueWithProfilingAndFor
    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;

-    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS();
+    MockKernel kernel(program.get(), kernelInfo, *pDevice);
+
+    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS(&kernel);
    requiredSize += 2 * sizeof(GPGPU_WALKER);

-    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    DispatchInfo dispatchInfo;
    dispatchInfo.setKernel(&kernel);
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    multiDispatchInfo.push(dispatchInfo);
-    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, false, nullptr);
+    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, false, &kernel);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_TASK, 0, true, false, *pCmdQ, multiDispatchInfo);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
@ -525,19 +528,21 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWit

    pCmdQ->setPerfCountersEnabled(true, 1);

-    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + sizeof(GPGPU_WALKER) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS();
+    MockKernel kernel(program.get(), kernelInfo, *pDevice);
+
+    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + sizeof(GPGPU_WALKER) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS(&kernel);
    //begin perf cmds
    requiredSize += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(MI_STORE_REGISTER_MEM) + OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM) + sizeof(MI_REPORT_PERF_COUNT) + pCmdQ->getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);
    //end perf cmds
    requiredSize += 2 * sizeof(PIPE_CONTROL) + 3 * sizeof(MI_STORE_REGISTER_MEM) + OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM) + sizeof(MI_REPORT_PERF_COUNT) + pCmdQ->getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);

-    auto &commandStreamNDRangeKernel = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, true, true, nullptr);
-    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, true, *pCmdQ, nullptr);
+    auto &commandStreamNDRangeKernel = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, true, true, &kernel);
+    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, true, *pCmdQ, &kernel);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamNDRangeKernel.getAvailableSpace(), requiredSize);

-    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, true, nullptr);
-    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_TASK, true, true, *pCmdQ, nullptr);
+    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, true, &kernel);
+    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_TASK, true, true, *pCmdQ, &kernel);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
    bool retVal = false;
@ -576,9 +581,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWit
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;
    typedef typename FamilyType::MI_REPORT_PERF_COUNT MI_REPORT_PERF_COUNT;

+    MockKernel kernel(program.get(), kernelInfo, *pDevice);
+
    pCmdQ->setPerfCountersEnabled(true, 1);

-    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS();
+    uint64_t requiredSize = 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM) + KernelCommandsHelper<FamilyType>::getSizeRequiredCS(&kernel);
    requiredSize += 2 * sizeof(GPGPU_WALKER);

    //begin perf cmds
@ -586,13 +593,12 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWit
    //end perf cmds
    requiredSize += 2 * sizeof(PIPE_CONTROL) + 3 * sizeof(MI_STORE_REGISTER_MEM) + OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM) + sizeof(MI_REPORT_PERF_COUNT) + pCmdQ->getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);

-    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    DispatchInfo dispatchInfo;
    dispatchInfo.setKernel(&kernel);
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    multiDispatchInfo.push(dispatchInfo);
-    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, true, nullptr);
+    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, true, true, &kernel);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_TASK, 0, true, true, *pCmdQ, multiDispatchInfo);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
--- a/unit_tests/test_files/igdrcl.config
+++ b/unit_tests/test_files/igdrcl.config
@ -101,3 +101,4 @@ EnableMakeResidentOnMapGpuVa = 0
 RenderCompressedImagesEnabled = -1
 RenderCompressedBuffersEnabled = -1
 AUBDumpForceAllToLocalMemory = 0
+EnableCacheFlushAfterWalker = 0