diff --git a/runtime/command_queue/local_id_gen.cpp b/runtime/command_queue/local_id_gen.cpp
index e02aa53c15..9cae96dbd0 100644
--- a/runtime/command_queue/local_id_gen.cpp
+++ b/runtime/command_queue/local_id_gen.cpp
@@ -57,9 +57,12 @@ LocalIDHelper::LocalIDHelper() {
 LocalIDHelper LocalIDHelper::initializer;
 
 //traditional function to generate local IDs
-void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder) {
+void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages) {
     auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
-    if (simd == 32) {
+    bool use4x4Layout = hasKernelOnlyImages && isCompatibleWith4x4Layout(localWorkgroupSize, dimensionsOrder, simd);
+    if (use4x4Layout) {
+        generateLocalIDsWith4x4Layout(buffer, localWorkgroupSize, simd);
+    } else if (simd == 32) {
         LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
     } else if (simd == 16) {
         LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
@@ -67,4 +70,57 @@ void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3>
         LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
     }
 }
+
+bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd) {
+    uint8_t rowWidth = simd == 32u ? 32u : 16u;
+    uint8_t xDelta = simd == 8u ? 2u : 4u;
+    uint8_t yDelta = simd == 8u || localWorkgroupSize.at(1) == 4u ? 4u : rowWidth / xDelta;
+    return dimensionsOrder.at(0) == 0 &&
+           dimensionsOrder.at(1) == 1 &&
+           (localWorkgroupSize.at(0) & (xDelta - 1)) == 0 &&
+           (localWorkgroupSize.at(1) & (yDelta - 1)) == 0;
+}
+
+inline void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd) {
+    uint8_t rowWidth = simd == 32u ? 32u : 16u;
+    uint8_t xDelta = simd == 8u ? 2u : 4u;                                                  // difference between corresponding values in consecutive X rows
+    uint8_t yDelta = simd == 8u || localWorkgroupSize.at(1) == 4u ? 4u : rowWidth / xDelta; // difference between corresponding values in consecutive Y rows
+    std::array<uint16_t, 3> replicationFactors{{static_cast<uint16_t>(localWorkgroupSize.at(0) / xDelta),
+                                                static_cast<uint16_t>(localWorkgroupSize.at(1) / yDelta),
+                                                static_cast<uint16_t>(localWorkgroupSize.at(2))}};
+    bool earlyGrowX = replicationFactors.at(1) == 1 && simd == 32u && replicationFactors.at(0) > 1;
+    bool earlyGrowZ = replicationFactors.at(1) == 1 && simd == 32u && !earlyGrowX && replicationFactors.at(2) > 1;
+    auto buffer = reinterpret_cast<uint16_t *>(b);
+    uint16_t offset = 0u;
+    for (uint16_t z = 0u; z < replicationFactors.at(2); z++) {
+        for (uint16_t y = 0u; y < replicationFactors.at(1); y++) {
+            for (uint16_t x = 0u; x < replicationFactors.at(0); x++) {
+                // row for X
+                for (uint8_t i = 0u; i < simd; i++) {
+                    if (earlyGrowX && i == yDelta * xDelta) {
+                        x++;
+                    }
+                    auto xValue = xDelta * x + (i & (xDelta - 1));
+                    buffer[offset + i] = xValue & (localWorkgroupSize.at(0) - 1);
+                }
+                offset += rowWidth;
+                // row for Y
+                for (uint8_t i = 0u; i < simd; i++) {
+                    auto yValue = yDelta * y + i / xDelta;
+                    buffer[offset + i] = yValue & (localWorkgroupSize.at(1) - 1);
+                }
+                offset += rowWidth;
+                // row for Z
+                for (uint8_t i = 0u; i < simd; i++) {
+                    if (earlyGrowZ && i == yDelta * xDelta) {
+                        z++;
+                    }
+                    auto zValue = z;
+                    buffer[offset + i] = zValue & (localWorkgroupSize.at(2) - 1);
+                }
+                offset += rowWidth;
+            }
+        }
+    }
+}
 } // namespace OCLRT
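For orientation, a minimal standalone sketch (not part of the patch, illustrative only) of what the new path writes for the simplest compatible case, SIMD16 with a 4x4x1 workgroup: all replication factors collapse to 1, so a single X/Y/Z row triple is emitted in which the X channel cycles through a 4-wide tile and the Y channel advances every xDelta lanes.

    #include <cstdio>

    int main() {
        const unsigned simd = 16, xDelta = 4;
        for (unsigned i = 0u; i < simd; i++)
            printf("%u ", i & (xDelta - 1)); // X row: 0 1 2 3 0 1 2 3 ...
        printf("\n");
        for (unsigned i = 0u; i < simd; i++)
            printf("%u ", i / xDelta);       // Y row: 0 0 0 0 1 1 1 1 ...
        printf("\n");                        // the Z row is all zeros for depth 1
        return 0;
    }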
diff --git a/runtime/command_queue/local_id_gen.h b/runtime/command_queue/local_id_gen.h
index c4d4ca225a..454a68a9a5 100644
--- a/runtime/command_queue/local_id_gen.h
+++ b/runtime/command_queue/local_id_gen.h
@@ -79,6 +79,8 @@ void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, uint16_t simd,
                           const std::array<uint8_t, 3> &dimensionsOrder);
 
 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
-                      const std::array<uint8_t, 3> &dimensionsOrder);
+                      const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages);
+void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);
+bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
 
 } // namespace OCLRT
\ No newline at end of file
diff --git a/runtime/helpers/aligned_memory.h b/runtime/helpers/aligned_memory.h
index 6377430d4d..85c7102eea 100644
--- a/runtime/helpers/aligned_memory.h
+++ b/runtime/helpers/aligned_memory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -28,6 +28,8 @@
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
+#include <functional>
+#include <memory>
 
 #ifdef _MSC_VER
 #define ALIGNAS(x) __declspec(align(x))
@@ -112,3 +114,6 @@ template <typename T>
 inline bool isAligned(T *ptr) {
     return (reinterpret_cast<uintptr_t>(ptr) & (alignof(T) - 1)) == 0;
 }
+inline auto allocateAlignedMemory(size_t bytes, size_t alignment) {
+    return std::unique_ptr<void, std::function<decltype(alignedFree)>>(alignedMalloc(bytes, alignment), alignedFree);
+}
diff --git a/runtime/helpers/kernel_commands.inl b/runtime/helpers/kernel_commands.inl
index a8fdae5742..be18ed5a3c 100644
--- a/runtime/helpers/kernel_commands.inl
+++ b/runtime/helpers/kernel_commands.inl
@@ -375,7 +375,8 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
         simd,
         numChannels,
         localWorkSize,
-        kernel.getKernelInfo().workgroupDimensionsOrder);
+        kernel.getKernelInfo().workgroupDimensionsOrder,
+        kernel.usesOnlyImages());
 
     // send interface descriptor data
     auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
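The allocateAlignedMemory helper added to aligned_memory.h above pairs alignedMalloc with alignedFree through a unique_ptr, so callers get RAII cleanup instead of a manual free. A minimal usage sketch, following the pattern the new unit tests below use:

    auto alignedMemory = allocateAlignedMemory(size, 32);            // owning smart pointer
    auto buffer = reinterpret_cast<uint16_t *>(alignedMemory.get()); // raw view for writing
    memset(buffer, 0xff, size);                                      // poison before generating IDs into it
    // alignedFree runs automatically when alignedMemory goes out of scope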
diff --git a/runtime/helpers/per_thread_data.cpp b/runtime/helpers/per_thread_data.cpp
index 732caa7574..1d270602b9 100644
--- a/runtime/helpers/per_thread_data.cpp
+++ b/runtime/helpers/per_thread_data.cpp
@@ -33,7 +33,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
     uint32_t simd,
     uint32_t numChannels,
     const size_t localWorkSizes[3],
-    const std::array<uint8_t, 3> &workgroupWalkOrder) {
+    const std::array<uint8_t, 3> &workgroupWalkOrder,
+    bool hasKernelOnlyImages) {
     auto offsetPerThreadData = indirectHeap.getUsed();
     if (numChannels) {
         auto localWorkSize = localWorkSizes[0] * localWorkSizes[1] * localWorkSizes[2];
@@ -46,7 +47,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
             std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizes[0]),
                                      static_cast<uint16_t>(localWorkSizes[1]),
                                      static_cast<uint16_t>(localWorkSizes[2])}},
-            std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}});
+            std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}},
+            hasKernelOnlyImages);
     }
     return offsetPerThreadData;
 }
diff --git a/runtime/helpers/per_thread_data.h b/runtime/helpers/per_thread_data.h
index e8ba8cfd4e..99a9884286 100644
--- a/runtime/helpers/per_thread_data.h
+++ b/runtime/helpers/per_thread_data.h
@@ -49,15 +49,8 @@ struct PerThreadDataHelper {
         uint32_t simd,
         uint32_t numChannels,
         const size_t localWorkSizes[3],
-        const std::array<uint8_t, 3> &workgroupWalkOrder);
-
-    static size_t sendPerThreadData(
-        LinearStream &indirectHeap,
-        uint32_t simd,
-        uint32_t numChannels,
-        const size_t localWorkSizes[3]) {
-        return sendPerThreadData(indirectHeap, simd, numChannels, localWorkSizes, std::array<uint8_t, 3>{{0, 1, 2}});
-    }
+        const std::array<uint8_t, 3> &workgroupWalkOrder,
+        bool hasKernelOnlyImages);
 
     static inline uint32_t getNumLocalIdChannels(const iOpenCL::SPatchThreadPayload &threadPayload) {
         return threadPayload.LocalIDXPresent +
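Since the convenience overload that defaulted the walk order to {0, 1, 2} is removed above, every caller now spells out both the walk order and the new flag. A sketch of an updated call site (variable names assumed, mirroring the tests below):

    auto offsetPerThreadData = PerThreadDataHelper::sendPerThreadData(
        indirectHeap,
        simd,
        numChannels,
        localWorkSizes,
        std::array<uint8_t, 3>{{0, 1, 2}}, // workgroupWalkOrder, formerly the implicit default
        false);                            // hasKernelOnlyImages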
diff --git a/unit_tests/command_queue/local_id_tests.cpp b/unit_tests/command_queue/local_id_tests.cpp
index d14101ad8b..0da0ce918f 100644
--- a/unit_tests/command_queue/local_id_tests.cpp
+++ b/unit_tests/command_queue/local_id_tests.cpp
@@ -248,20 +248,20 @@ struct LocalIDFixture : public ::testing::TestWithParam<std::tuple<int, int, int, int>>
 TEST_P(LocalIDFixture, checkIDWithinLimits) {
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     std::array<uint8_t, 3>{{0, 1, 2}});
+                     std::array<uint8_t, 3>{{0, 1, 2}}, false);
     validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
 }
 
 TEST_P(LocalIDFixture, checkAllWorkItemsCovered) {
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     std::array<uint8_t, 3>{{0, 1, 2}});
+                     std::array<uint8_t, 3>{{0, 1, 2}}, false);
     validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
 }
 
 TEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
     auto dimensionsOrder = std::array<uint8_t, 3>{{0, 1, 2}};
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     dimensionsOrder);
+                     dimensionsOrder, false);
     validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
     validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -269,7 +269,7 @@ TEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
 TEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
     auto dimensionsOrder = std::array<uint8_t, 3>{{1, 0, 2}};
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     dimensionsOrder);
+                     dimensionsOrder, false);
     validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
     validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -277,7 +277,7 @@ TEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
 TEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
     auto dimensionsOrder = std::array<uint8_t, 3>{{2, 1, 0}};
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     dimensionsOrder);
+                     dimensionsOrder, false);
     validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
     validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -296,6 +296,199 @@ TEST_P(LocalIDFixture, sizeCalculationLocalIDs) {
     EXPECT_EQ(numGRFsExpected * sizeGRF, sizeTotalPerThreadData);
 }
 
+using LocalIds4x4LayoutTest = ::testing::TestWithParam<uint16_t>;
+
+TEST(LocalIds4x4LayoutTest, given4x4x1LocalWorkSizeWithDefaultDimensionsOrderWhenCheck2x4CompatibilityThenReturnTrue) {
+    std::array<uint16_t, 3> localWorkSize{{4u, 4u, 1u}};
+    std::array<uint8_t, 3> dimensionsOrder = {{0u, 1u, 2u}};
+    EXPECT_TRUE(isCompatibleWith4x4Layout(localWorkSize, dimensionsOrder, 16));
+}
+
+TEST(LocalIds4x4LayoutTest, givenNonCompatible4x4x1LocalWorkSizeWithDefaultDimensionsOrderWhenCheck2x4CompatibilityThenReturnFalse) {
+    std::array<uint8_t, 3> dimensionsOrder = {{0u, 1u, 2u}};
+    EXPECT_FALSE(isCompatibleWith4x4Layout({{2u, 5u, 1u}}, dimensionsOrder, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout({{1u, 4u, 1u}}, dimensionsOrder, 8));
+}
+
+TEST(LocalIds4x4LayoutTest, given4x4x1LocalWorkSizeWithNonDefaultDimensionsOrderWhenCheck2x4CompatibilityThenReturnFalse) {
+    std::array<uint16_t, 3> localWorkSize{{2u, 4u, 1u}};
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{0, 2, 1}}, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{1, 0, 2}}, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{1, 2, 0}}, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{2, 0, 1}}, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{2, 1, 0}}, 8));
+}
+
+TEST_P(LocalIds4x4LayoutTest, givenLWS4x4x1WhenGenerateLocalIdsThenHasKernelImagesOnlyFlagDoesntMatter) {
+    uint16_t simd = GetParam();
+    uint8_t rowWidth = simd == 32 ? 32 : 16;
+    uint16_t xDelta = simd == 8u ? 2u : 4u;
+    std::array<uint16_t, 3> localWorkSize{{xDelta, 4u, 1u}};
+    uint16_t totalLocalWorkSize = 4u * xDelta;
+    auto dimensionsOrder = std::array<uint8_t, 3>{{0u, 1u, 2u}};
+
+    auto elemsInBuffer = rowWidth * 3u;
+    auto size = elemsInBuffer * sizeof(uint16_t);
+
+    auto alignedMemory1 = allocateAlignedMemory(size, 32);
+    auto buffer1 = reinterpret_cast<uint16_t *>(alignedMemory1.get());
+    memset(buffer1, 0xff, size);
+
+    auto alignedMemory2 = allocateAlignedMemory(size, 32);
+    auto buffer2 = reinterpret_cast<uint16_t *>(alignedMemory2.get());
+    memset(buffer2, 0xff, size);
+
+    generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false);
+    generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true);
+
+    for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) {
+        for (auto j = 0u; j < rowWidth; j++) {
+            if (j < totalLocalWorkSize) {
+                auto offset = (i * rowWidth + j) * sizeof(uint16_t);
+                auto cmpValue = memcmp(ptrOffset(buffer1, offset), ptrOffset(buffer2, offset), sizeof(uint16_t));
+                EXPECT_EQ(0, cmpValue);
+            }
+        }
+    }
+}
+
+TEST_P(LocalIds4x4LayoutTest, givenLWS4x4x2WhenGenerateLocalIdsWithKernelWithOnlyImagesThenApplies4x4Layout) {
+    uint16_t simd = GetParam();
+    uint8_t rowWidth = simd == 32 ? 32 : 16;
+    uint16_t xDelta = simd == 8u ? 2u : 4u;
+    uint16_t zDelta = simd == 32u ? 2u : 1u;
+    std::array<uint16_t, 3> localWorkSize{4u, 4u, 2u};
+    auto dimensionsOrder = std::array<uint8_t, 3>{{0u, 1u, 2u}};
+    auto elemsInBuffer = 3u * localWorkSize.at(0) * localWorkSize.at(1) * localWorkSize.at(2);
+    if (simd == 8u) {
+        elemsInBuffer *= 2;
+    }
+    auto size = elemsInBuffer * sizeof(uint16_t);
+    auto alignedMemory = allocateAlignedMemory(size, 32);
+    auto buffer = reinterpret_cast<uint16_t *>(alignedMemory.get());
+    memset(buffer, 0xff, size);
+    EXPECT_TRUE(isCompatibleWith4x4Layout(localWorkSize, dimensionsOrder, simd));
+    generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true);
+
+    auto numRows = elemsInBuffer / rowWidth;
+    auto numGrfs = numRows / 3u;
+
+    for (auto i = 0u; i < numGrfs; i++) {
+
+        // validate X row
+        uint16_t baseX = buffer[i * 3 * rowWidth];
+        uint16_t currentX = baseX;
+        for (int j = 1; j < simd; j++) {
+            currentX = baseX + ((currentX + 1) & (xDelta - 1));
+            EXPECT_EQ(buffer[i * 3 * rowWidth + j], currentX);
+        }
+
+        // validate Y row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedY = ((j / xDelta) & 0b11);
+            EXPECT_EQ(buffer[i * 3 * rowWidth + rowWidth + j], expectedY);
+        }
+
+        // validate Z row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedZ = 2 * i / numGrfs + j / (simd / zDelta); // early grow Z
+            EXPECT_EQ(buffer[i * 3 * rowWidth + 2 * rowWidth + j], expectedZ);
+        }
+    }
+}
+
+TEST_P(LocalIds4x4LayoutTest, givenLWS8x4x2WhenGenerateLocalIdsWithKernelWithOnlyImagesThenApplies4x4Layout) {
+    uint16_t simd = GetParam();
+    uint8_t rowWidth = simd == 32 ? 32 : 16;
+    uint16_t xDelta = simd == 8u ? 2u : 4u;
+    std::array<uint16_t, 3> localWorkSize{8u, 4u, 2u};
+    auto dimensionsOrder = std::array<uint8_t, 3>{{0u, 1u, 2u}};
+    auto elemsInBuffer = 3u * localWorkSize.at(0) * localWorkSize.at(1) * localWorkSize.at(2);
+    if (simd == 8u) {
+        elemsInBuffer *= 2;
+    }
+    auto size = elemsInBuffer * sizeof(uint16_t);
+    auto alignedMemory = allocateAlignedMemory(size, 32);
+    auto buffer = reinterpret_cast<uint16_t *>(alignedMemory.get());
+    memset(buffer, 0xff, size);
+    EXPECT_TRUE(isCompatibleWith4x4Layout(localWorkSize, dimensionsOrder, simd));
+    generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true);
+
+    auto numRows = elemsInBuffer / rowWidth;
+    auto numGrfs = numRows / 3u;
+
+    for (auto i = 0u; i < numGrfs; i++) {
+
+        // validate X row
+        uint16_t baseX = buffer[i * 3 * rowWidth];
+        uint16_t currentX = baseX;
+        for (int j = 1; j < simd; j++) {
+            if (j == 16) {
+                // early grow X
+                baseX += xDelta;
+            }
+            currentX = baseX + ((currentX + 1) & (xDelta - 1));
+            EXPECT_EQ(buffer[i * 3 * rowWidth + j], currentX);
+        }
+
+        // validate Y row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedY = ((j / xDelta) & 0b11);
+            EXPECT_EQ(buffer[i * 3 * rowWidth + rowWidth + j], expectedY);
+        }
+
+        // validate Z row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedZ = 2 * i / numGrfs;
+            EXPECT_EQ(buffer[i * 3 * rowWidth + 2 * rowWidth + j], expectedZ);
+        }
+    }
+}
+
+TEST_P(LocalIds4x4LayoutTest, givenLWS8x8x2WhenGenerateLocalIdsWithKernelWithOnlyImagesThenApplies4x4Layout) {
+    uint16_t simd = GetParam();
+    uint8_t rowWidth = simd == 32 ? 32 : 16;
+    uint16_t xDelta = simd == 8u ? 2u : 4u;
+    std::array<uint16_t, 3> localWorkSize{8u, 8u, 2u};
+    auto dimensionsOrder = std::array<uint8_t, 3>{{0u, 1u, 2u}};
+    auto elemsInBuffer = 3u * localWorkSize.at(0) * localWorkSize.at(1) * localWorkSize.at(2);
+    if (simd == 8u) {
+        elemsInBuffer *= 2;
+    }
+    auto size = elemsInBuffer * sizeof(uint16_t);
+    auto alignedMemory = allocateAlignedMemory(size, 32);
+    auto buffer = reinterpret_cast<uint16_t *>(alignedMemory.get());
+    memset(buffer, 0xff, size);
+    EXPECT_TRUE(isCompatibleWith4x4Layout(localWorkSize, dimensionsOrder, simd));
+    generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true);
+
+    auto numRows = elemsInBuffer / rowWidth;
+    auto numGrfs = numRows / 3u;
+
+    for (auto i = 0u; i < numGrfs; i++) {
+
+        // validate X row
+        uint16_t baseX = buffer[i * 3 * rowWidth];
+        uint16_t currentX = baseX;
+        for (int j = 1; j < simd; j++) {
+            currentX = baseX + ((currentX + 1) & (xDelta - 1));
+            EXPECT_EQ(buffer[i * 3 * rowWidth + j], currentX);
+        }
+
+        // validate Y row
+        uint16_t baseY = buffer[i * 3 * rowWidth + rowWidth];
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedY = baseY + ((j / xDelta) & 0b111);
+            EXPECT_EQ(buffer[i * 3 * rowWidth + rowWidth + j], expectedY);
+        }
+
+        // validate Z row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedZ = 2 * i / numGrfs;
+            EXPECT_EQ(buffer[i * 3 * rowWidth + 2 * rowWidth + j], expectedZ);
+        }
+    }
+}
+
 #define SIMDParams ::testing::Values(8, 16, 32)
 #if HEAVY_DUTY_TESTING
 #define LWSXParams ::testing::Values(1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 64, 128, 256)
@@ -308,6 +501,7 @@ TEST_P(LocalIDFixture, sizeCalculationLocalIDs) {
 #endif
 
 INSTANTIATE_TEST_CASE_P(AllCombinations, LocalIDFixture, ::testing::Combine(SIMDParams, LWSXParams, LWSYParams, LWSZParams));
+INSTANTIATE_TEST_CASE_P(4x4LWSLayoutTests, LocalIds4x4LayoutTest, SIMDParams);
 
 // To debug a specific configuration replace the list of Values with specific values.
 // NOTE: You'll need a unique test prefix
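Following the two comment lines above, a hypothetical debug instantiation (the SingleConfigDebug prefix and the pinned values are illustrative, not part of the patch) would look like:

    INSTANTIATE_TEST_CASE_P(SingleConfigDebug, LocalIDFixture,
                            ::testing::Combine(::testing::Values(16),  // simd
                                               ::testing::Values(4),   // LWS X
                                               ::testing::Values(4),   // LWS Y
                                               ::testing::Values(1))); // LWS Z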
diff --git a/unit_tests/helpers/kernel_commands_tests.cpp b/unit_tests/helpers/kernel_commands_tests.cpp
index de7ecbd794..f03a3dc754 100644
--- a/unit_tests/helpers/kernel_commands_tests.cpp
+++ b/unit_tests/helpers/kernel_commands_tests.cpp
@@ -514,7 +514,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKern
     auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
     generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(),
                      std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
-                     std::array<uint8_t, 3>{{modifiedKernelInfo.workgroupDimensionsOrder[0], modifiedKernelInfo.workgroupDimensionsOrder[1], modifiedKernelInfo.workgroupDimensionsOrder[2]}});
+                     std::array<uint8_t, 3>{{modifiedKernelInfo.workgroupDimensionsOrder[0], modifiedKernelInfo.workgroupDimensionsOrder[1], modifiedKernelInfo.workgroupDimensionsOrder[2]}}, false);
 
     EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
     alignedFree(expectedLocalIds);
 }
diff --git a/unit_tests/helpers/per_thread_data_tests.cpp b/unit_tests/helpers/per_thread_data_tests.cpp
index 1d29ae745f..a9d157e798 100644
--- a/unit_tests/helpers/per_thread_data_tests.cpp
+++ b/unit_tests/helpers/per_thread_data_tests.cpp
@@ -73,6 +73,8 @@ struct PerThreadDataTests : public DeviceFixture,
         alignedFree(indirectHeapMemory);
         DeviceFixture::TearDown();
     }
+
+    const std::array<uint8_t, 3> workgroupWalkOrder = {{0, 1, 2}};
 
     uint32_t simd;
     uint32_t numChannels;
     uint32_t kernelIsa[32];
@@ -107,7 +109,9 @@ HWTEST_F(PerThreadDataXYZTests, sendPerThreadData_256x1x1) {
         indirectHeap,
         simd,
         numChannels,
-        localWorkSizes);
+        localWorkSizes,
+        workgroupWalkOrder,
+        false);
 
     auto expectedPerThreadDataSizeTotal = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, numChannels, localWorkSize);
     size_t sizeConsumed = indirectHeap.getUsed() - offsetPerThreadData;
@@ -123,7 +127,9 @@ HWTEST_F(PerThreadDataXYZTests, sendPerThreadData_2x4x8) {
         indirectHeap,
         simd,
         numChannels,
-        localWorkSizes);
+        localWorkSizes,
+        workgroupWalkOrder,
+        false);
 
     size_t sizeConsumed = indirectHeap.getUsed() - offsetPerThreadData;
     EXPECT_EQ(64u * (3u * 2u * 4u * 8u) / 32u, sizeConsumed);
@@ -174,7 +180,9 @@ HWTEST_F(PerThreadDataNoIdsTests, sendPerThreadDataDoesntSendAnyData) {
         indirectHeap,
         simd,
         numChannels,
-        localWorkSizes);
+        localWorkSizes,
+        workgroupWalkOrder,
+        false);
 
     size_t sizeConsumed = indirectHeap.getUsed() - offsetPerThreadData;
     EXPECT_EQ(0u, sizeConsumed);
@@ -245,7 +253,9 @@ TEST(PerThreadDataTest, generateLocalIDs) {
         stream,
         simd,
         numChannels,
-        localWorkSizes);
+        localWorkSizes,
+        {{0, 1, 2}},
+        false);
 
     // Check if buffer overrun happend, only first sizePerThreadDataTotal bytes can be overwriten, following should be same as reference.
     for (auto i = sizePerThreadDataTotal; i < sizeOverSizedBuffer; i += sizePerThreadDataTotal) {