Apply (2/4)x4x1 layout when generating local IDs for kernels that use only images

- For SIMD8 apply 2x4x1 layout
- For SIMD16/SIMD32 apply 4x4x1 layout

Change-Id: I31bceb49387011c66da5f96ad2a71125b96d4cda
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
2026-01-09 22:43:00 +08:00 · 2018-08-08 12:49:36 +02:00
parent cf971158c6
commit 47f3dad619
9 changed files with 289 additions and 26 deletions
--- a/runtime/command_queue/local_id_gen.cpp
+++ b/runtime/command_queue/local_id_gen.cpp
@@ -57,9 +57,12 @@ LocalIDHelper::LocalIDHelper() {
 LocalIDHelper LocalIDHelper::initializer;

 //traditional function to generate local IDs
-void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder) {
+void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages) {
    auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
-    if (simd == 32) {
+    bool use4x4Layout = hasKernelOnlyImages && isCompatibleWith4x4Layout(localWorkgroupSize, dimensionsOrder, simd);
+    if (use4x4Layout) {
+        generateLocalIDsWith4x4Layout(buffer, localWorkgroupSize, simd);
+    } else if (simd == 32) {
        LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
    } else if (simd == 16) {
        LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
@@ -67,4 +70,57 @@ void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3>
        LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
    }
 }
+
+bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd) { // true when the walk order starts X, Y and both sizes are whole multiples of the layout tile
+    const bool walksXThenY = dimensionsOrder.at(0) == 0 && dimensionsOrder.at(1) == 1;
+    const uint16_t width = localWorkgroupSize.at(0);
+    const uint16_t height = localWorkgroupSize.at(1);
+    const uint8_t lanesPerRow = (simd == 32u) ? 32u : 16u;
+    const uint8_t tileWidth = (simd == 8u) ? 2u : 4u;                                       // 2 columns for SIMD8, 4 otherwise
+    const uint8_t tileHeight = (simd == 8u || height == 4u) ? 4u : lanesPerRow / tileWidth; // 4 rows, or 8 for SIMD32 when the Y size is not exactly 4
+    return walksXThenY && (width & (tileWidth - 1)) == 0 && (height & (tileHeight - 1)) == 0; // tile sides are powers of two, so a mask tests divisibility
+}
+
+void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd) { // writes an X, a Y and a Z row of uint16_t local ids per thread into b, walking the workgroup in xDelta-by-yDelta tiles
+    uint8_t rowWidth = simd == 32u ? 32u : 16u;                                             // uint16_t slots reserved per row; a SIMD8 row fills only its first 8 slots
+    uint8_t xDelta = simd == 8u ? 2u : 4u;                                                  // difference between corresponding values in consecutive X rows
+    uint8_t yDelta = simd == 8u || localWorkgroupSize.at(1) == 4u ? 4u : rowWidth / xDelta; // difference between corresponding values in consecutive Y rows
+    std::array<uint16_t, 3> replicationFactors{{static_cast<uint16_t>(localWorkgroupSize.at(0) / xDelta), // number of tiles along X
+                                                static_cast<uint16_t>(localWorkgroupSize.at(1) / yDelta), // number of tiles along Y
+                                                static_cast<uint16_t>(localWorkgroupSize.at(2))}};        // number of slices along Z
+    bool earlyGrowX = replicationFactors.at(1) == 1 && simd == 32u && replicationFactors.at(0) > 1;                // SIMD32 with a single Y tile: lanes from xDelta * yDelta onward take the next X tile
+    bool earlyGrowZ = replicationFactors.at(1) == 1 && simd == 32u && !earlyGrowX && replicationFactors.at(2) > 1; // same situation with no further X tile: those lanes take the next Z slice
+    auto buffer = reinterpret_cast<uint16_t *>(b);
+    uint16_t offset = 0u; // index of the row currently being written, counted in uint16_t elements
+    for (uint16_t z = 0u; z < replicationFactors.at(2); z++) {
+        for (uint16_t y = 0u; y < replicationFactors.at(1); y++) {
+            for (uint16_t x = 0u; x < replicationFactors.at(0); x++) {
+                // row for X: tile origin plus the lane's column inside the tile
+                for (uint8_t i = 0u; i < simd; i++) {
+                    if (earlyGrowX && i == yDelta * xDelta) {
+                        x++; // deliberately advances the loop counter: this thread also covers the next X tile
+                    }
+                    auto xValue = xDelta * x + (i & (xDelta - 1));
+                    buffer[offset + i] = xValue % localWorkgroupSize.at(0); // modulo wrap; equals the size-1 mask for power-of-two sizes and stays correct for others such as 12
+                }
+                offset += rowWidth;
+                // row for Y: tile origin plus the lane's row inside the tile
+                for (uint8_t i = 0u; i < simd; i++) {
+                    auto yValue = yDelta * y + i / xDelta;
+                    buffer[offset + i] = yValue % localWorkgroupSize.at(1); // wraps lanes that run past the Y size
+                }
+                offset += rowWidth;
+                // row for Z: the current slice for every lane
+                for (uint8_t i = 0u; i < simd; i++) {
+                    if (earlyGrowZ && i == yDelta * xDelta) {
+                        z++; // deliberately advances the loop counter: this thread also covers the next Z slice
+                    }
+                    auto zValue = z;
+                    buffer[offset + i] = zValue % localWorkgroupSize.at(2); // wraps lanes that run past the Z size
+                }
+                offset += rowWidth;
+            }
+        }
+    }
+}
 } // namespace OCLRT
--- a/runtime/command_queue/local_id_gen.h
+++ b/runtime/command_queue/local_id_gen.h
@@ -79,6 +79,8 @@ void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroup
                          const std::array<uint8_t, 3> &dimensionsOrder);

 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
-                      const std::array<uint8_t, 3> &dimensionsOrder);
+                      const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages);
+void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);

+bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
 } // namespace OCLRT
--- a/runtime/helpers/aligned_memory.h
+++ b/runtime/helpers/aligned_memory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -28,6 +28,8 @@
 #include <cstdint>
 #include <cstddef>
 #include <algorithm>
+#include <memory>
+#include <functional>

 #ifdef _MSC_VER
 #define ALIGNAS(x) __declspec(align(x))
@@ -112,3 +114,6 @@ template <typename T>
 inline bool isAligned(T *ptr) {
    return (reinterpret_cast<uintptr_t>(ptr) & (alignof(T) - 1)) == 0;
 }
+inline auto allocateAlignedMemory(size_t bytes, size_t alignment) { // wraps alignedMalloc's result in a unique_ptr whose deleter is alignedFree
+    return std::unique_ptr<void, std::function<decltype(alignedFree)>>{alignedMalloc(bytes, alignment), alignedFree};
+}
--- a/runtime/helpers/kernel_commands.inl
+++ b/runtime/helpers/kernel_commands.inl
@@ -375,7 +375,8 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
        simd,
        numChannels,
        localWorkSize,
-        kernel.getKernelInfo().workgroupDimensionsOrder);
+        kernel.getKernelInfo().workgroupDimensionsOrder,
+        kernel.usesOnlyImages());

    // send interface descriptor data
    auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
--- a/runtime/helpers/per_thread_data.cpp
+++ b/runtime/helpers/per_thread_data.cpp
@@ -33,7 +33,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
    uint32_t simd,
    uint32_t numChannels,
    const size_t localWorkSizes[3],
-    const std::array<uint8_t, 3> &workgroupWalkOrder) {
+    const std::array<uint8_t, 3> &workgroupWalkOrder,
+    bool hasKernelOnlyImages) {
    auto offsetPerThreadData = indirectHeap.getUsed();
    if (numChannels) {
        auto localWorkSize = localWorkSizes[0] * localWorkSizes[1] * localWorkSizes[2];
@@ -46,7 +47,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
                         std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizes[0]),
                                                  static_cast<uint16_t>(localWorkSizes[1]),
                                                  static_cast<uint16_t>(localWorkSizes[2])}},
-                         std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}});
+                         std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}},
+                         hasKernelOnlyImages);
    }
    return offsetPerThreadData;
 }
--- a/runtime/helpers/per_thread_data.h
+++ b/runtime/helpers/per_thread_data.h
@@ -49,15 +49,8 @@ struct PerThreadDataHelper {
        uint32_t simd,
        uint32_t numChannels,
        const size_t localWorkSizes[3],
-        const std::array<uint8_t, 3> &workgroupWalkOrder);
-
-    static size_t sendPerThreadData(
-        LinearStream &indirectHeap,
-        uint32_t simd,
-        uint32_t numChannels,
-        const size_t localWorkSizes[3]) {
-        return sendPerThreadData(indirectHeap, simd, numChannels, localWorkSizes, std::array<uint8_t, 3>{{0, 1, 2}});
-    }
+        const std::array<uint8_t, 3> &workgroupWalkOrder,
+        bool hasKernelOnlyImages);

    static inline uint32_t getNumLocalIdChannels(const iOpenCL::SPatchThreadPayload &threadPayload) {
        return threadPayload.LocalIDXPresent +