diff --git a/runtime/command_queue/local_id_gen.cpp b/runtime/command_queue/local_id_gen.cpp
index e02aa53c15..9cae96dbd0 100644
--- a/runtime/command_queue/local_id_gen.cpp
+++ b/runtime/command_queue/local_id_gen.cpp
@@ -57,9 +57,12 @@ LocalIDHelper::LocalIDHelper() {
 LocalIDHelper LocalIDHelper::initializer;
 
 //traditional function to generate local IDs
-void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder) {
+void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages) {
     auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
-    if (simd == 32) {
+    bool use4x4Layout = hasKernelOnlyImages && isCompatibleWith4x4Layout(localWorkgroupSize, dimensionsOrder, simd);
+    if (use4x4Layout) {
+        generateLocalIDsWith4x4Layout(buffer, localWorkgroupSize, simd);
+    } else if (simd == 32) {
         LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
     } else if (simd == 16) {
         LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
@@ -67,4 +70,57 @@ void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3>
         LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
     }
 }
+
+bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd) {
+    uint8_t rowWidth = simd == 32u ? 32u : 16u;
+    uint8_t xDelta = simd == 8u ? 2u : 4u;
+    uint8_t yDelta = simd == 8u || localWorkgroupSize.at(1) == 4u ? 4u : rowWidth / xDelta;
+    return dimensionsOrder.at(0) == 0 &&
+           dimensionsOrder.at(1) == 1 &&
+           (localWorkgroupSize.at(0) & (xDelta - 1)) == 0 &&
+           (localWorkgroupSize.at(1) & (yDelta - 1)) == 0;
+}
+
+inline void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd) {
+    uint8_t rowWidth = simd == 32u ? 32u : 16u;
+    uint8_t xDelta = simd == 8u ? 2u : 4u;                                                  // difference between corresponding values in consecutive X rows
+    uint8_t yDelta = simd == 8u || localWorkgroupSize.at(1) == 4u ? 4u : rowWidth / xDelta; // difference between corresponding values in consecutive Y rows
+    std::array<uint16_t, 3> replicationFactors{{static_cast<uint16_t>(localWorkgroupSize.at(0) / xDelta),
+                                                static_cast<uint16_t>(localWorkgroupSize.at(1) / yDelta),
+                                                static_cast<uint16_t>(localWorkgroupSize.at(2))}};
+    bool earlyGrowX = replicationFactors.at(1) == 1 && simd == 32u && replicationFactors.at(0) > 1;
+    bool earlyGrowZ = replicationFactors.at(1) == 1 && simd == 32u && !earlyGrowX && replicationFactors.at(2) > 1;
+    auto buffer = reinterpret_cast<uint16_t *>(b);
+    uint16_t offset = 0u;
+    for (uint16_t z = 0u; z < replicationFactors.at(2); z++) {
+        for (uint16_t y = 0u; y < replicationFactors.at(1); y++) {
+            for (uint16_t x = 0u; x < replicationFactors.at(0); x++) {
+                // row for X
+                for (uint8_t i = 0u; i < simd; i++) {
+                    if (earlyGrowX && i == yDelta * xDelta) {
+                        x++;
+                    }
+                    auto xValue = xDelta * x + (i & (xDelta - 1));
+                    buffer[offset + i] = xValue & (localWorkgroupSize.at(0) - 1);
+                }
+                offset += rowWidth;
+                // row for Y
+                for (uint8_t i = 0u; i < simd; i++) {
+                    auto yValue = yDelta * y + i / xDelta;
+                    buffer[offset + i] = yValue & (localWorkgroupSize.at(1) - 1);
+                }
+                offset += rowWidth;
+                // row for Z
+                for (uint8_t i = 0u; i < simd; i++) {
+                    if (earlyGrowZ && i == yDelta * xDelta) {
+                        z++;
+                    }
+                    auto zValue = z;
+                    buffer[offset + i] = zValue & (localWorkgroupSize.at(2) - 1);
+                }
+                offset += rowWidth;
+            }
+        }
+    }
+}
 } // namespace OCLRT
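For orientation, a minimal standalone sketch (not part of the patch, illustrative only) of what the new path writes for the simplest compatible case, SIMD16 with a 4x4x1 workgroup: all replication factors collapse to 1, so a single X/Y/Z row triple is emitted in which the X channel cycles through a 4-wide tile and the Y channel advances every xDelta lanes.

    #include <cstdio>

    int main() {
        const unsigned simd = 16, xDelta = 4;
        for (unsigned i = 0u; i < simd; i++)
            printf("%u ", i & (xDelta - 1)); // X row: 0 1 2 3 0 1 2 3 ...
        printf("\n");
        for (unsigned i = 0u; i < simd; i++)
            printf("%u ", i / xDelta);       // Y row: 0 0 0 0 1 1 1 1 ...
        printf("\n");                        // the Z row is all zeros for depth 1
        return 0;
    }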
diff --git a/runtime/command_queue/local_id_gen.h b/runtime/command_queue/local_id_gen.h
index c4d4ca225a..454a68a9a5 100644
--- a/runtime/command_queue/local_id_gen.h
+++ b/runtime/command_queue/local_id_gen.h
@@ -79,6 +79,8 @@ void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, uint16_t simd,
                           const std::array<uint8_t, 3> &dimensionsOrder);
 
 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
-                      const std::array<uint8_t, 3> &dimensionsOrder);
+                      const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages);
+void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);
+bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
 
 } // namespace OCLRT
\ No newline at end of file
diff --git a/runtime/helpers/aligned_memory.h b/runtime/helpers/aligned_memory.h
index 6377430d4d..85c7102eea 100644
--- a/runtime/helpers/aligned_memory.h
+++ b/runtime/helpers/aligned_memory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -28,6 +28,8 @@
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
+#include <functional>
+#include <memory>
 
 #ifdef _MSC_VER
 #define ALIGNAS(x) __declspec(align(x))
@@ -112,3 +114,6 @@ template <typename T>
 inline bool isAligned(T *ptr) {
     return (reinterpret_cast<uintptr_t>(ptr) & (alignof(T) - 1)) == 0;
 }
+inline auto allocateAlignedMemory(size_t bytes, size_t alignment) {
+    return std::unique_ptr<void, std::function<decltype(alignedFree)>>(alignedMalloc(bytes, alignment), alignedFree);
+}
diff --git a/runtime/helpers/kernel_commands.inl b/runtime/helpers/kernel_commands.inl
index a8fdae5742..be18ed5a3c 100644
--- a/runtime/helpers/kernel_commands.inl
+++ b/runtime/helpers/kernel_commands.inl
@@ -375,7 +375,8 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
         simd,
         numChannels,
         localWorkSize,
-        kernel.getKernelInfo().workgroupDimensionsOrder);
+        kernel.getKernelInfo().workgroupDimensionsOrder,
+        kernel.usesOnlyImages());
 
     // send interface descriptor data
     auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
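The allocateAlignedMemory helper added to aligned_memory.h above pairs alignedMalloc with alignedFree through a unique_ptr, so callers get RAII cleanup instead of a manual free. A minimal usage sketch, following the pattern the new unit tests below use:

    auto alignedMemory = allocateAlignedMemory(size, 32);            // owning smart pointer
    auto buffer = reinterpret_cast<uint16_t *>(alignedMemory.get()); // raw view for writing
    memset(buffer, 0xff, size);                                      // poison before generating IDs into it
    // alignedFree runs automatically when alignedMemory goes out of scope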
diff --git a/runtime/helpers/per_thread_data.cpp b/runtime/helpers/per_thread_data.cpp
index 732caa7574..1d270602b9 100644
--- a/runtime/helpers/per_thread_data.cpp
+++ b/runtime/helpers/per_thread_data.cpp
@@ -33,7 +33,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
     uint32_t simd,
     uint32_t numChannels,
     const size_t localWorkSizes[3],
-    const std::array<uint8_t, 3> &workgroupWalkOrder) {
+    const std::array<uint8_t, 3> &workgroupWalkOrder,
+    bool hasKernelOnlyImages) {
     auto offsetPerThreadData = indirectHeap.getUsed();
     if (numChannels) {
         auto localWorkSize = localWorkSizes[0] * localWorkSizes[1] * localWorkSizes[2];
@@ -46,7 +47,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
             std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizes[0]),
                                      static_cast<uint16_t>(localWorkSizes[1]),
                                      static_cast<uint16_t>(localWorkSizes[2])}},
-            std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}});
+            std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}},
+            hasKernelOnlyImages);
     }
     return offsetPerThreadData;
 }
diff --git a/runtime/helpers/per_thread_data.h b/runtime/helpers/per_thread_data.h
index e8ba8cfd4e..99a9884286 100644
--- a/runtime/helpers/per_thread_data.h
+++ b/runtime/helpers/per_thread_data.h
@@ -49,15 +49,8 @@ struct PerThreadDataHelper {
         uint32_t simd,
         uint32_t numChannels,
         const size_t localWorkSizes[3],
-        const std::array<uint8_t, 3> &workgroupWalkOrder);
-
-    static size_t sendPerThreadData(
-        LinearStream &indirectHeap,
-        uint32_t simd,
-        uint32_t numChannels,
-        const size_t localWorkSizes[3]) {
-        return sendPerThreadData(indirectHeap, simd, numChannels, localWorkSizes, std::array<uint8_t, 3>{{0, 1, 2}});
-    }
+        const std::array<uint8_t, 3> &workgroupWalkOrder,
+        bool hasKernelOnlyImages);
 
     static inline uint32_t getNumLocalIdChannels(const iOpenCL::SPatchThreadPayload &threadPayload) {
         return threadPayload.LocalIDXPresent +
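Since the convenience overload that defaulted the walk order to {0, 1, 2} is removed above, every caller now spells out both the walk order and the new flag. A sketch of an updated call site (variable names assumed, mirroring the tests below):

    auto offsetPerThreadData = PerThreadDataHelper::sendPerThreadData(
        indirectHeap,
        simd,
        numChannels,
        localWorkSizes,
        std::array<uint8_t, 3>{{0, 1, 2}}, // workgroupWalkOrder, formerly the implicit default
        false);                            // hasKernelOnlyImages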
diff --git a/unit_tests/command_queue/local_id_tests.cpp b/unit_tests/command_queue/local_id_tests.cpp
index d14101ad8b..0da0ce918f 100644
--- a/unit_tests/command_queue/local_id_tests.cpp
+++ b/unit_tests/command_queue/local_id_tests.cpp
@@ -248,20 +248,20 @@ struct LocalIDFixture : public ::testing::TestWithParam<std::tuple<int, int, int, int>>
 TEST_P(LocalIDFixture, checkIDWithinLimits) {
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     std::array<uint8_t, 3>{{0, 1, 2}});
+                     std::array<uint8_t, 3>{{0, 1, 2}}, false);
     validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
 }
 
 TEST_P(LocalIDFixture, checkAllWorkItemsCovered) {
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     std::array<uint8_t, 3>{{0, 1, 2}});
+                     std::array<uint8_t, 3>{{0, 1, 2}}, false);
     validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
 }
 
 TEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
     auto dimensionsOrder = std::array<uint8_t, 3>{{0, 1, 2}};
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     dimensionsOrder);
+                     dimensionsOrder, false);
     validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
     validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -269,7 +269,7 @@ TEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
 TEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
     auto dimensionsOrder = std::array<uint8_t, 3>{{1, 0, 2}};
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     dimensionsOrder);
+                     dimensionsOrder, false);
     validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
     validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -277,7 +277,7 @@ TEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
 TEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
     auto dimensionsOrder = std::array<uint8_t, 3>{{2, 1, 0}};
     generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     dimensionsOrder);
+                     dimensionsOrder, false);
     validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
     validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -296,6 +296,199 @@ TEST_P(LocalIDFixture, sizeCalculationLocalIDs) {
     EXPECT_EQ(numGRFsExpected * sizeGRF, sizeTotalPerThreadData);
 }
 
+using LocalIds4x4LayoutTest = ::testing::TestWithParam<uint16_t>;
+
+TEST(LocalIds4x4LayoutTest, given4x4x1LocalWorkSizeWithDefaultDimensionsOrderWhenCheck2x4CompatibilityThenReturnTrue) {
+    std::array<uint16_t, 3> localWorkSize{{4u, 4u, 1u}};
+    std::array<uint8_t, 3> dimensionsOrder = {{0u, 1u, 2u}};
+    EXPECT_TRUE(isCompatibleWith4x4Layout(localWorkSize, dimensionsOrder, 16));
+}
+
+TEST(LocalIds4x4LayoutTest, givenNonCompatible4x4x1LocalWorkSizeWithDefaultDimensionsOrderWhenCheck2x4CompatibilityThenReturnFalse) {
+    std::array<uint8_t, 3> dimensionsOrder = {{0u, 1u, 2u}};
+    EXPECT_FALSE(isCompatibleWith4x4Layout({{2u, 5u, 1u}}, dimensionsOrder, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout({{1u, 4u, 1u}}, dimensionsOrder, 8));
+}
+
+TEST(LocalIds4x4LayoutTest, given4x4x1LocalWorkSizeWithNonDefaultDimensionsOrderWhenCheck2x4CompatibilityThenReturnFalse) {
+    std::array<uint16_t, 3> localWorkSize{{2u, 4u, 1u}};
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{0, 2, 1}}, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{1, 0, 2}}, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{1, 2, 0}}, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{2, 0, 1}}, 8));
+    EXPECT_FALSE(isCompatibleWith4x4Layout(localWorkSize, {{2, 1, 0}}, 8));
+}
+
+TEST_P(LocalIds4x4LayoutTest, givenLWS4x4x1WhenGenerateLocalIdsThenHasKernelImagesOnlyFlagDoesntMatter) {
+    uint16_t simd = GetParam();
+    uint8_t rowWidth = simd == 32 ? 32 : 16;
+    uint16_t xDelta = simd == 8u ? 2u : 4u;
+    std::array<uint16_t, 3> localWorkSize{{xDelta, 4u, 1u}};
+    uint16_t totalLocalWorkSize = 4u * xDelta;
+    auto dimensionsOrder = std::array<uint8_t, 3>{{0u, 1u, 2u}};
+
+    auto elemsInBuffer = rowWidth * 3u;
+    auto size = elemsInBuffer * sizeof(uint16_t);
+
+    auto alignedMemory1 = allocateAlignedMemory(size, 32);
+    auto buffer1 = reinterpret_cast<uint16_t *>(alignedMemory1.get());
+    memset(buffer1, 0xff, size);
+
+    auto alignedMemory2 = allocateAlignedMemory(size, 32);
+    auto buffer2 = reinterpret_cast<uint16_t *>(alignedMemory2.get());
+    memset(buffer2, 0xff, size);
+
+    generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false);
+    generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true);
+
+    for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) {
+        for (auto j = 0u; j < rowWidth; j++) {
+            if (j < totalLocalWorkSize) {
+                auto offset = (i * rowWidth + j) * sizeof(uint16_t);
+                auto cmpValue = memcmp(ptrOffset(buffer1, offset), ptrOffset(buffer2, offset), sizeof(uint16_t));
+                EXPECT_EQ(0, cmpValue);
+            }
+        }
+    }
+}
+
+TEST_P(LocalIds4x4LayoutTest, givenLWS4x4x2WhenGenerateLocalIdsWithKernelWithOnlyImagesThenApplies4x4Layout) {
+    uint16_t simd = GetParam();
+    uint8_t rowWidth = simd == 32 ? 32 : 16;
+    uint16_t xDelta = simd == 8u ? 2u : 4u;
+    uint16_t zDelta = simd == 32u ? 2u : 1u;
+    std::array<uint16_t, 3> localWorkSize{4u, 4u, 2u};
+    auto dimensionsOrder = std::array<uint8_t, 3>{{0u, 1u, 2u}};
+    auto elemsInBuffer = 3u * localWorkSize.at(0) * localWorkSize.at(1) * localWorkSize.at(2);
+    if (simd == 8u) {
+        elemsInBuffer *= 2;
+    }
+    auto size = elemsInBuffer * sizeof(uint16_t);
+    auto alignedMemory = allocateAlignedMemory(size, 32);
+    auto buffer = reinterpret_cast<uint16_t *>(alignedMemory.get());
+    memset(buffer, 0xff, size);
+    EXPECT_TRUE(isCompatibleWith4x4Layout(localWorkSize, dimensionsOrder, simd));
+    generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true);
+
+    auto numRows = elemsInBuffer / rowWidth;
+    auto numGrfs = numRows / 3u;
+
+    for (auto i = 0u; i < numGrfs; i++) {
+
+        // validate X row
+        uint16_t baseX = buffer[i * 3 * rowWidth];
+        uint16_t currentX = baseX;
+        for (int j = 1; j < simd; j++) {
+            currentX = baseX + ((currentX + 1) & (xDelta - 1));
+            EXPECT_EQ(buffer[i * 3 * rowWidth + j], currentX);
+        }
+
+        // validate Y row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedY = ((j / xDelta) & 0b11);
+            EXPECT_EQ(buffer[i * 3 * rowWidth + rowWidth + j], expectedY);
+        }
+
+        // validate Z row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedZ = 2 * i / numGrfs + j / (simd / zDelta); // early grow Z
+            EXPECT_EQ(buffer[i * 3 * rowWidth + 2 * rowWidth + j], expectedZ);
+        }
+    }
+}
+
+TEST_P(LocalIds4x4LayoutTest, givenLWS8x4x2WhenGenerateLocalIdsWithKernelWithOnlyImagesThenApplies4x4Layout) {
+    uint16_t simd = GetParam();
+    uint8_t rowWidth = simd == 32 ? 32 : 16;
+    uint16_t xDelta = simd == 8u ? 2u : 4u;
+    std::array<uint16_t, 3> localWorkSize{8u, 4u, 2u};
+    auto dimensionsOrder = std::array<uint8_t, 3>{{0u, 1u, 2u}};
+    auto elemsInBuffer = 3u * localWorkSize.at(0) * localWorkSize.at(1) * localWorkSize.at(2);
+    if (simd == 8u) {
+        elemsInBuffer *= 2;
+    }
+    auto size = elemsInBuffer * sizeof(uint16_t);
+    auto alignedMemory = allocateAlignedMemory(size, 32);
+    auto buffer = reinterpret_cast<uint16_t *>(alignedMemory.get());
+    memset(buffer, 0xff, size);
+    EXPECT_TRUE(isCompatibleWith4x4Layout(localWorkSize, dimensionsOrder, simd));
+    generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true);
+
+    auto numRows = elemsInBuffer / rowWidth;
+    auto numGrfs = numRows / 3u;
+
+    for (auto i = 0u; i < numGrfs; i++) {
+
+        // validate X row
+        uint16_t baseX = buffer[i * 3 * rowWidth];
+        uint16_t currentX = baseX;
+        for (int j = 1; j < simd; j++) {
+            if (j == 16) {
+                // early grow X
+                baseX += xDelta;
+            }
+            currentX = baseX + ((currentX + 1) & (xDelta - 1));
+            EXPECT_EQ(buffer[i * 3 * rowWidth + j], currentX);
+        }
+
+        // validate Y row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedY = ((j / xDelta) & 0b11);
+            EXPECT_EQ(buffer[i * 3 * rowWidth + rowWidth + j], expectedY);
+        }
+
+        // validate Z row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedZ = 2 * i / numGrfs;
+            EXPECT_EQ(buffer[i * 3 * rowWidth + 2 * rowWidth + j], expectedZ);
+        }
+    }
+}
+
+TEST_P(LocalIds4x4LayoutTest, givenLWS8x8x2WhenGenerateLocalIdsWithKernelWithOnlyImagesThenApplies4x4Layout) {
+    uint16_t simd = GetParam();
+    uint8_t rowWidth = simd == 32 ? 32 : 16;
+    uint16_t xDelta = simd == 8u ? 2u : 4u;
+    std::array<uint16_t, 3> localWorkSize{8u, 8u, 2u};
+    auto dimensionsOrder = std::array<uint8_t, 3>{{0u, 1u, 2u}};
+    auto elemsInBuffer = 3u * localWorkSize.at(0) * localWorkSize.at(1) * localWorkSize.at(2);
+    if (simd == 8u) {
+        elemsInBuffer *= 2;
+    }
+    auto size = elemsInBuffer * sizeof(uint16_t);
+    auto alignedMemory = allocateAlignedMemory(size, 32);
+    auto buffer = reinterpret_cast<uint16_t *>(alignedMemory.get());
+    memset(buffer, 0xff, size);
+    EXPECT_TRUE(isCompatibleWith4x4Layout(localWorkSize, dimensionsOrder, simd));
+    generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true);
+
+    auto numRows = elemsInBuffer / rowWidth;
+    auto numGrfs = numRows / 3u;
+
+    for (auto i = 0u; i < numGrfs; i++) {
+
+        // validate X row
+        uint16_t baseX = buffer[i * 3 * rowWidth];
+        uint16_t currentX = baseX;
+        for (int j = 1; j < simd; j++) {
+            currentX = baseX + ((currentX + 1) & (xDelta - 1));
+            EXPECT_EQ(buffer[i * 3 * rowWidth + j], currentX);
+        }
+
+        // validate Y row
+        uint16_t baseY = buffer[i * 3 * rowWidth + rowWidth];
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedY = baseY + ((j / xDelta) & 0b111);
+            EXPECT_EQ(buffer[i * 3 * rowWidth + rowWidth + j], expectedY);
+        }
+
+        // validate Z row
+        for (int j = 0; j < simd; j++) {
+            uint16_t expectedZ = 2 * i / numGrfs;
+            EXPECT_EQ(buffer[i * 3 * rowWidth + 2 * rowWidth + j], expectedZ);
+        }
+    }
+}
+
 #define SIMDParams ::testing::Values(8, 16, 32)
 #if HEAVY_DUTY_TESTING
 #define LWSXParams ::testing::Values(1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 64, 128, 256)
@@ -308,6 +501,7 @@ TEST_P(LocalIDFixture, sizeCalculationLocalIDs) {
 #endif
 
 INSTANTIATE_TEST_CASE_P(AllCombinations, LocalIDFixture, ::testing::Combine(SIMDParams, LWSXParams, LWSYParams, LWSZParams));
+INSTANTIATE_TEST_CASE_P(4x4LWSLayoutTests, LocalIds4x4LayoutTest, SIMDParams);
 
 // To debug a specific configuration replace the list of Values with specific values.
 // NOTE: You'll need a unique test prefix
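Following the two comment lines above, a hypothetical debug instantiation (the SingleConfigDebug prefix and the pinned values are illustrative, not part of the patch) would look like:

    INSTANTIATE_TEST_CASE_P(SingleConfigDebug, LocalIDFixture,
                            ::testing::Combine(::testing::Values(16),  // simd
                                               ::testing::Values(4),   // LWS X
                                               ::testing::Values(4),   // LWS Y
                                               ::testing::Values(1))); // LWS Z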
diff --git a/unit_tests/helpers/kernel_commands_tests.cpp b/unit_tests/helpers/kernel_commands_tests.cpp
index de7ecbd794..f03a3dc754 100644
--- a/unit_tests/helpers/kernel_commands_tests.cpp
+++ b/unit_tests/helpers/kernel_commands_tests.cpp
@@ -514,7 +514,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKern
     auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
     generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(),
                      std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
-                     std::array<uint8_t, 3>{{modifiedKernelInfo.workgroupDimensionsOrder[0], modifiedKernelInfo.workgroupDimensionsOrder[1], modifiedKernelInfo.workgroupDimensionsOrder[2]}});
+                     std::array<uint8_t, 3>{{modifiedKernelInfo.workgroupDimensionsOrder[0], modifiedKernelInfo.workgroupDimensionsOrder[1], modifiedKernelInfo.workgroupDimensionsOrder[2]}}, false);
 
     EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
     alignedFree(expectedLocalIds);
 }
diff --git a/unit_tests/helpers/per_thread_data_tests.cpp b/unit_tests/helpers/per_thread_data_tests.cpp
index 1d29ae745f..a9d157e798 100644
--- a/unit_tests/helpers/per_thread_data_tests.cpp
+++ b/unit_tests/helpers/per_thread_data_tests.cpp
@@ -73,6 +73,8 @@ struct PerThreadDataTests : public DeviceFixture,
         alignedFree(indirectHeapMemory);
         DeviceFixture::TearDown();
     }
+
+    const std::array<uint8_t, 3> workgroupWalkOrder = {{0, 1, 2}};
 
     uint32_t simd;
     uint32_t numChannels;
     uint32_t kernelIsa[32];
@@ -107,7 +109,9 @@ HWTEST_F(PerThreadDataXYZTests, sendPerThreadData_256x1x1) {
         indirectHeap,
         simd,
         numChannels,
-        localWorkSizes);
+        localWorkSizes,
+        workgroupWalkOrder,
+        false);
 
     auto expectedPerThreadDataSizeTotal = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, numChannels, localWorkSize);
     size_t sizeConsumed = indirectHeap.getUsed() - offsetPerThreadData;
@@ -123,7 +127,9 @@ HWTEST_F(PerThreadDataXYZTests, sendPerThreadData_2x4x8) {
         indirectHeap,
         simd,
         numChannels,
-        localWorkSizes);
+        localWorkSizes,
+        workgroupWalkOrder,
+        false);
 
     size_t sizeConsumed = indirectHeap.getUsed() - offsetPerThreadData;
     EXPECT_EQ(64u * (3u * 2u * 4u * 8u) / 32u, sizeConsumed);
@@ -174,7 +180,9 @@ HWTEST_F(PerThreadDataNoIdsTests, sendPerThreadDataDoesntSendAnyData) {
         indirectHeap,
         simd,
         numChannels,
-        localWorkSizes);
+        localWorkSizes,
+        workgroupWalkOrder,
+        false);
 
     size_t sizeConsumed = indirectHeap.getUsed() - offsetPerThreadData;
     EXPECT_EQ(0u, sizeConsumed);
@@ -245,7 +253,9 @@ TEST(PerThreadDataTest, generateLocalIDs) {
         stream,
         simd,
         numChannels,
-        localWorkSizes);
+        localWorkSizes,
+        {{0, 1, 2}},
+        false);
 
     // Check if buffer overrun happend, only first sizePerThreadDataTotal bytes can be overwriten, following should be same as reference.
     for (auto i = sizePerThreadDataTotal; i < sizeOverSizedBuffer; i += sizePerThreadDataTotal) {