Add support for packed SIMD1 dispatch

Change-Id: I3f2bf8e62e0a38d358fb87f02c88c387c874f6b3
Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
2026-01-03 14:55:24 +08:00 · 2019-10-23 09:36:37 +02:00
parent 2eafa99342
commit 51dcf2b6d2
19 changed files with 233 additions and 5 deletions
--- a/unit_tests/command_queue/dispatch_walker_tests.cpp
+++ b/unit_tests/command_queue/dispatch_walker_tests.cpp
@@ -124,6 +124,27 @@ HWTEST_F(DispatchWalkerTest, WhenGettingComputeDimensionsThenCorrectNumberOfDime
    EXPECT_EQ(3u, computeDimensions(workItems3D));
 }

+HWTEST_F(DispatchWalkerTest, givenSimd1WhenSetGpgpuWalkerThreadDataThenSimdInWalkerIsSetTo32Value) {
+    // Programs a walker with simd = 1 and expects its SIMD size field to read 32 >> 4.
+    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
+    uint32_t pCmdBuffer[1024];
+    MockGraphicsAllocation gfxAllocation(static_cast<void *>(pCmdBuffer), sizeof(pCmdBuffer));
+    LinearStream linearStream(&gfxAllocation);
+    WALKER_TYPE *computeWalker = static_cast<WALKER_TYPE *>(linearStream.getSpace(sizeof(WALKER_TYPE)));
+    *computeWalker = FamilyType::cmdInitGpgpuWalker;
+
+    size_t globalOffsets[] = {0, 0, 0};
+    size_t startWorkGroups[] = {0, 0, 0};
+    size_t numWorkGroups[] = {1, 1, 1};
+    size_t localWorkSizesIn[] = {32, 1, 1};
+    uint32_t simd = 1;
+    // Value-initialized so any payload field the helper reads is well-defined.
+    iOpenCL::SPatchThreadPayload threadPayload = {};
+
+    GpgpuWalkerHelper<FamilyType>::setGpgpuWalkerThreadData(
+        computeWalker, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizesIn, simd, 3, true, false, threadPayload, 5u);
+    EXPECT_EQ(computeWalker->getSimdSize(), 32 >> 4);
+}
+
 HWTEST_F(DispatchWalkerTest, WhenDispatchingWalkerThenCommandStreamMemoryIsntChanged) {
    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());
--- a/unit_tests/command_queue/local_id_tests.cpp
+++ b/unit_tests/command_queue/local_id_tests.cpp
@@ -59,6 +59,12 @@ TEST(LocalID, PerThreadSizeLocalIDs_SIMD32) {
    EXPECT_EQ(6 * sizeof(GRF), getPerThreadSizeLocalIDs(simd));
 }

+TEST(LocalID, PerThreadSizeLocalIDs_SIMD1) {
+    // With simd = 1 the per-thread local-ID size is expected to be exactly one GRF.
+    const uint32_t simd1 = 1;
+    EXPECT_EQ(sizeof(GRF), getPerThreadSizeLocalIDs(simd1));
+}
+
 struct LocalIDFixture : public ::testing::TestWithParam<std::tuple<int, int, int, int>> {
    void SetUp() override {
        simd = std::get<0>(GetParam());
--- a/unit_tests/helpers/per_thread_data_tests.cpp
+++ b/unit_tests/helpers/per_thread_data_tests.cpp
@@ -252,3 +252,50 @@ TEST(PerThreadDataTest, generateLocalIDs) {
    alignedFree(buffer);
    alignedFree(reference);
 }
+
+TEST(PerThreadDataTest, givenSimdEqualOneWhenSetingLocalIdsInPerThreadDataThenIdsAreSetInCorrectOrder) {
+    // Sends per-thread data for a 3x4x2 work-group with simd = 1, then verifies that the
+    // local IDs appear in x-fastest order and that nothing was written past the expected size.
+    const uint32_t simd = 1;
+    const uint32_t numChannels = 3;
+    const uint32_t localWorkSize = 24;
+    size_t localWorkSizes[3] = {3, 4, 2};
+
+    const auto perThreadDataTotal = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, numChannels, localWorkSize);
+    const auto overSizedBufferSize = perThreadDataTotal * 4;
+
+    // Zero-filled destination, four times larger than the total per-thread data size.
+    auto buffer = static_cast<char *>(alignedMalloc(overSizedBufferSize, 16));
+    memset(buffer, 0, overSizedBufferSize);
+
+    // Zero-filled reference used to detect writes beyond the expected region.
+    auto reference = static_cast<char *>(alignedMalloc(perThreadDataTotal, 16));
+    memset(reference, 0, perThreadDataTotal);
+
+    // The stream is limited to half of the over-sized buffer.
+    LinearStream stream(buffer, overSizedBufferSize / 2);
+    PerThreadDataHelper::sendPerThreadData(
+        stream, simd, numChannels, localWorkSizes, {{0, 1, 2}}, false);
+
+    // Walk the buffer one GRF at a time; each GRF is expected to begin with the
+    // {x, y, z} local ID of the next work-item, with x varying fastest and z slowest.
+    const char *grfCursor = buffer;
+    for (uint16_t z = 0; z < localWorkSizes[2]; z++) {
+        for (uint16_t y = 0; y < localWorkSizes[1]; y++) {
+            for (uint16_t x = 0; x < localWorkSizes[0]; x++) {
+                const uint16_t expectedIds[] = {x, y, z};
+                EXPECT_EQ(0, memcmp(grfCursor, expectedIds, sizeof(expectedIds)));
+                grfCursor += sizeof(GRF);
+            }
+        }
+    }
+
+    // Buffer overrun check: only the first perThreadDataTotal bytes may have been written,
+    // so every following chunk of that size must still match the zeroed reference.
+    for (auto offset = perThreadDataTotal; offset < overSizedBufferSize; offset += perThreadDataTotal) {
+        EXPECT_EQ(0, memcmp(buffer + offset, reference, perThreadDataTotal));
+    }
+
+    alignedFree(reference);
+    alignedFree(buffer);
+}
--- a/unit_tests/kernel/kernel_tests.cpp
+++ b/unit_tests/kernel/kernel_tests.cpp
@@ -2163,6 +2163,16 @@ TEST_F(KernelExecutionEnvironmentTest, getMaxSimdReturns1WhenExecutionEnvironmen
    this->pKernelInfo->patchInfo.executionEnvironment = oldExcEnv;
 }

+TEST_F(KernelExecutionEnvironmentTest, getMaxSimdReturns1WhenLargestCompilledSimdSizeEqualOne) {
+    // executionEnvironment is a fixture member; presumably the fixture points
+    // pKernelInfo->patchInfo.executionEnvironment at it — TODO confirm against SetUp.
+    executionEnvironment.LargestCompiledSIMDSize = 1;
+
+    // patchInfo.executionEnvironment itself is never reassigned here, so there is
+    // nothing to back up or restore around the query.
+    EXPECT_EQ(1U, this->pKernelInfo->getMaxSimdSize());
+}
+
 TEST_F(KernelExecutionEnvironmentTest, getMaxRequiredWorkGroupSizeWhenCompiledWorkGroupSizeIsZero) {
    auto maxWorkGroupSize = pDevice->getDeviceInfo().maxWorkGroupSize;
    auto oldRequiredWorkGroupSizeX = this->pKernelInfo->patchInfo.executionEnvironment->RequiredWorkGroupSizeX;