/*
 * Copyright (C) 2018-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/test_macros/hw_test.h"

#include <algorithm>
#include <cstdint>

using namespace NEO;

using LocalIdTests = ::testing::Test;

HWTEST_F(LocalIdTests, GivenSimd8WhenGettingGrfsPerThreadThenOneIsReturned) {
    uint32_t simd = 8;
    EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32));
}

HWTEST_F(LocalIdTests, GivenSimd16WhenGettingGrfsPerThreadThenOneIsReturned) {
    uint32_t simd = 16;
    EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32));
}

HWTEST_F(LocalIdTests, GivenSimd32WhenGettingGrfsPerThreadThenTwoIsReturned) {
    uint32_t simd = 32;
    EXPECT_EQ(2u, getNumGrfsPerLocalIdCoordinate(simd, 32));
}

HWTEST_F(LocalIdTests, GivenSimd1WhenGettingGrfsPerThreadThenOneIsReturned) {
    uint32_t simd = 1;
    EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32));
}

HWTEST_F(LocalIdTests, GivenSimd32AndNon32GrfSizeWhenGettingGrfsPerThreadThenOneIsReturned) {
    uint32_t simd = 32;
    EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 33));
}
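
// Taken together, the tests above pin down the cases exercised here: getNumGrfsPerLocalIdCoordinate
// needs a second GRF per coordinate only for SIMD32 with a 32-byte GRF; every other tested
// combination fits a coordinate into a single GRF.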

TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) {
    uint32_t lws = 33;
    uint32_t simd = 32;
    EXPECT_EQ(2u, getThreadsPerWG(simd, lws));
}
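
// The expected value is plain round-up division: ceil(33 / 32) == 2, i.e. the 33rd work item
// spills into a second, partially filled hardware thread.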

TEST(LocalID, GivenSimd8WhenGettingPerThreadSizeLocalIdsThenValueIsThreeTimesGrfSize) {
    uint32_t simd = 8;
    uint32_t grfSize = 32;

    // 3 channels (x,y,z) * 1 GRF per thread (@SIMD8)
    EXPECT_EQ(3 * grfSize, getPerThreadSizeLocalIDs(simd, grfSize));
}

TEST(LocalID, GivenSimd16WhenGettingPerThreadSizeLocalIdsThenValueIsThreeTimesGrfSize) {
    uint32_t simd = 16;
    uint32_t grfSize = 32;

    // 3 channels (x,y,z) * 1 GRF per thread (@SIMD16)
    EXPECT_EQ(3 * grfSize, getPerThreadSizeLocalIDs(simd, grfSize));
}

TEST(LocalID, GivenSimd32WhenGettingPerThreadSizeLocalIdsThenValueIsSixTimesGrfSize) {
    uint32_t simd = 32;
    uint32_t grfSize = 32;

    // 3 channels (x,y,z) * 2 GRFs per thread (@SIMD32)
    EXPECT_EQ(6 * grfSize, getPerThreadSizeLocalIDs(simd, grfSize));
}

TEST(LocalID, GivenSimd1WhenGettingPerThreadSizeLocalIdsThenValueIsEqualGrfSize) {
    uint32_t simd = 1;
    uint32_t grfSize = 32;

    EXPECT_EQ(grfSize, getPerThreadSizeLocalIDs(simd, grfSize));
}
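
// Pattern of the tests above: the per-thread local id payload is 3 coordinate channels times the
// GRFs needed per coordinate, with SIMD1 as the special case that packs everything into one GRF.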

TEST(LocalID, WhenThreadsPerWgAreGeneratedThenCalculationsAreCorrect) {
    const auto lws = 33u;
    for (const auto &simd : {1u, 8u, 16u, 32u}) {
        switch (simd) {
        case 1u:
            EXPECT_EQ(lws, getThreadsPerWG(simd, lws));
            break;
        case 32u:
            EXPECT_EQ((lws + std::max(32u, simd) - 1) >> 5, getThreadsPerWG(simd, lws));
            break;
        case 8u:
            EXPECT_EQ((lws + simd - 1) >> 3, getThreadsPerWG(simd, lws));
            break;
        case 16u:
            EXPECT_EQ((lws + simd - 1) >> 4, getThreadsPerWG(simd, lws));
            break;
        default:
            break;
        }
    }
}
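
// The shifts above are round-up divisions by the SIMD width: adding (simd - 1) before shifting by
// log2(simd) yields ceil(lws / simd) without a divide.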

TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize) {
    auto localIdsPtr = allocateAlignedMemory(3 * 64u, MemoryConstants::cacheLineSize);

    uint16_t *localIdsView = reinterpret_cast<uint16_t *>(localIdsPtr.get());
    std::array<uint16_t, 3u> localSizes = {{2u, 2u, 1u}};
    std::array<uint8_t, 3u> dimensionsOrder = {{0u, 1u, 2u}};

    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, rootDeviceEnvironment);
    EXPECT_EQ(localIdsView[0], 0u);
    EXPECT_EQ(localIdsView[1], 1u);
    EXPECT_EQ(localIdsView[2], 0u);
    EXPECT_EQ(localIdsView[3], 1u);

    EXPECT_EQ(localIdsView[32], 0u);
    EXPECT_EQ(localIdsView[33], 0u);
    EXPECT_EQ(localIdsView[34], 1u);
    EXPECT_EQ(localIdsView[35], 1u);

    EXPECT_EQ(localIdsView[64], 0u);
    EXPECT_EQ(localIdsView[65], 0u);
    EXPECT_EQ(localIdsView[66], 0u);
    EXPECT_EQ(localIdsView[67], 0u);
}
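
// With a 64-byte GRF each coordinate row spans 64 / sizeof(uint16_t) == 32 elements, which is why
// the X, Y and Z lanes checked above sit at offsets 0, 32 and 64 of the view.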

struct LocalIDFixture : ::testing::TestWithParam<std::tuple<int, int, int, int, int>> {
    void SetUp() override {
        simd = std::get<0>(GetParam());
        grfSize = std::get<1>(GetParam());
        localWorkSizeX = std::get<2>(GetParam());
        localWorkSizeY = std::get<3>(GetParam());
        localWorkSizeZ = std::get<4>(GetParam());

        localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
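        // The scratch buffer allocated below only covers 256 work items, so clamp Y and then Z
        // until the total local work size fits; X is kept as requested.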
        if (localWorkSize > 256) {
            localWorkSizeY = std::min(256 / localWorkSizeX, localWorkSizeY);
            localWorkSizeZ = std::min(256 / (localWorkSizeX * localWorkSizeY), localWorkSizeZ);
            localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
        }

        const auto bufferSize = 32 * 3 * 16 * sizeof(uint16_t);
        buffer = reinterpret_cast<uint16_t *>(alignedMalloc(bufferSize, 32));
        memset(buffer, 0xff, bufferSize);
    }

    void TearDown() override {
        alignedFree(buffer);
    }

    void validateIDWithinLimits(uint32_t simd, uint32_t lwsX, uint32_t lwsY, uint32_t lwsZ, bool useFullRowSize) {
        auto idsPerThread = simd;

        // As per BackEnd HLD, SIMD32 has 32 localIDs per channel. SIMD8/16 has up to 16 localIDs.
        auto skipPerThread = (simd == 32 || useFullRowSize) ? 32 : 16;

        auto pBufferX = buffer;
        auto pBufferY = pBufferX + skipPerThread;
        auto pBufferZ = pBufferY + skipPerThread;

        auto numWorkItems = lwsX * lwsY * lwsZ;

        size_t itemIndex = 0;
        while (numWorkItems > 0) {
            EXPECT_LT(pBufferX[itemIndex], lwsX) << simd << " " << lwsX << " " << lwsY << " " << lwsZ;
            EXPECT_LT(pBufferY[itemIndex], lwsY) << simd << " " << lwsX << " " << lwsY << " " << lwsZ;
            EXPECT_LT(pBufferZ[itemIndex], lwsZ) << simd << " " << lwsX << " " << lwsY << " " << lwsZ;
            ++itemIndex;
            if (idsPerThread == itemIndex) {
                pBufferX += skipPerThread * 3;
                pBufferY += skipPerThread * 3;
                pBufferZ += skipPerThread * 3;

                itemIndex = 0;
            }
            --numWorkItems;
        }
    }

    void validateAllWorkItemsCovered(uint32_t simd, uint32_t lwsX, uint32_t lwsY, uint32_t lwsZ, bool useFullRow) {
        auto idsPerThread = simd;

        // As per BackEnd HLD, SIMD32 has 32 localIDs per channel. SIMD8/16 has up to 16 localIDs.
        auto skipPerThread = (simd == 32 || useFullRow) ? 32 : 16;

        auto pBufferX = buffer;
        auto pBufferY = pBufferX + skipPerThread;
        auto pBufferZ = pBufferY + skipPerThread;

        auto numWorkItems = lwsX * lwsY * lwsZ;

        // Initialize local ID hit table
        uint32_t localIDHitTable[8];
        memset(localIDHitTable, 0, sizeof(localIDHitTable));

        size_t itemIndex = 0;
        while (numWorkItems > 0) {
            // Flatten out the IDs
            auto workItem = pBufferX[itemIndex] + pBufferY[itemIndex] * lwsX + pBufferZ[itemIndex] * lwsX * lwsY;
            ASSERT_LT(workItem, 256u);

            // Look up in the hit table
            auto &hitItem = localIDHitTable[workItem / 32];
            auto hitBit = 1 << (workItem % 32);

            // No double-hits
            EXPECT_EQ(0u, hitItem & hitBit);

            // Set that work item as hit
            hitItem |= hitBit;

            ++itemIndex;
            if (idsPerThread == itemIndex) {
                pBufferX += skipPerThread * 3;
                pBufferY += skipPerThread * 3;
                pBufferZ += skipPerThread * 3;

                itemIndex = 0;
            }
            --numWorkItems;
        }

        // All entries in the hit table should be of the form 2^n - 1: each flattened id is hit
        // exactly once, so the set bits form a contiguous run from bit 0 and (i & (i + 1)) == 0.
        for (uint32_t i : localIDHitTable) {
            EXPECT_EQ(0u, i & (i + 1));
        }
    }
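
    // validateWalkOrder below replays the expected nested-loop order (innermost dimension first)
    // and checks that every (x, y, z) tuple lands in the matching lane of the matching thread.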

    void validateWalkOrder(uint32_t simd, uint32_t localWorkgroupSizeX, uint32_t localWorkgroupSizeY, uint32_t localWorkgroupSizeZ,
                           const std::array<uint8_t, 3> &dimensionsOrder) {
        std::array<uint8_t, 3> walkOrder = {};
        for (uint32_t i = 0; i < 3; ++i) {
            // inverts the walk order mapping (from DIM_ID->ORDER_ID to ORDER_ID->DIM_ID)
            walkOrder[dimensionsOrder[i]] = i;
        }
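        // e.g. a dimensionsOrder of {1, 2, 0} inverts to a walkOrder of {2, 0, 1}: entry i of the
        // input names the slot of the output that receives the value i.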

        auto skipPerThread = simd == 32 ? 32 : 16;

        auto pBufferX = buffer;
        auto pBufferY = pBufferX + skipPerThread;
        auto pBufferZ = pBufferY + skipPerThread;
        decltype(pBufferX) ids[] = {pBufferX, pBufferY, pBufferZ};
        uint32_t sizes[] = {localWorkgroupSizeX, localWorkgroupSizeY, localWorkgroupSizeZ};

        uint32_t flattenedId = 0;
        for (uint32_t id2 = 0; id2 < sizes[walkOrder[2]]; ++id2) {
            for (uint32_t id1 = 0; id1 < sizes[walkOrder[1]]; ++id1) {
                for (uint32_t id0 = 0; id0 < sizes[walkOrder[0]]; ++id0) {
                    uint32_t threadId = flattenedId / simd;
                    uint32_t channelId = flattenedId % simd;
                    uint16_t foundId0 = ids[walkOrder[0]][channelId + threadId * skipPerThread * 3];
                    uint16_t foundId1 = ids[walkOrder[1]][channelId + threadId * skipPerThread * 3];
                    uint16_t foundId2 = ids[walkOrder[2]][channelId + threadId * skipPerThread * 3];
                    if ((id0 != foundId0) || (id1 != foundId1) || (id2 != foundId2)) {
                        EXPECT_EQ(id0, foundId0) << simd << " X @ (" << id0 << ", " << id1 << ", " << id2 << ") - flat " << flattenedId;
                        EXPECT_EQ(id1, foundId1) << simd << " Y @ (" << id0 << ", " << id1 << ", " << id2 << ") - flat " << flattenedId;
                        EXPECT_EQ(id2, foundId2) << simd << " Z @ (" << id0 << ", " << id1 << ", " << id2 << ") - flat " << flattenedId;
                    }
                    ++flattenedId;
                }
            }
        }
    }

    void dumpBuffer(uint32_t simd, uint32_t lwsX, uint32_t lwsY, uint32_t lwsZ) {
        auto workSize = lwsX * lwsY * lwsZ;
        auto threads = Math::divideAndRoundUp(workSize, simd);

        auto pBuffer = buffer;

        // As per BackEnd HLD, SIMD32 has 32 localIDs per channel. SIMD8/16 has up to 16 localIDs.
        auto skipPerThread = simd == 32 ? 32 : 16;

        while (threads-- > 0) {
            auto lanes = std::min(workSize, simd);

            for (auto dimension = 0u; dimension < 3u; ++dimension) {
                for (auto lane = 0u; lane < lanes; ++lane) {
                    printf("%04d ", (unsigned int)pBuffer[lane]);
                }
                pBuffer += skipPerThread;
                printf("\n");
            }

            workSize -= simd;
        }
    }

    // Test parameters
    uint32_t localWorkSizeX;
    uint32_t localWorkSizeY;
    uint32_t localWorkSizeZ;
    uint32_t localWorkSize;
    uint32_t simd;
    uint32_t grfSize;

    // Provide support for a max LWS of 256
    // 32 threads @ SIMD8
    // 3 channels (x/y/z)
    // 16 lanes per thread (SIMD8 - only 8 used)
    uint16_t *buffer;
};

HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenIdsAreWithinLimits) {
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
                     std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment);
    validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
}

HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenAllWorkItemsCovered) {
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
                     std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment);
    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
}

HWTEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
    auto dimensionsOrder = std::array<uint8_t, 3>{{0, 1, 2}};
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
                     dimensionsOrder, false, grfSize, rootDeviceEnvironment);
    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
}

HWTEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
    auto dimensionsOrder = std::array<uint8_t, 3>{{1, 0, 2}};
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
                     dimensionsOrder, false, grfSize, rootDeviceEnvironment);
    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
}

HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
    auto dimensionsOrder = std::array<uint8_t, 3>{{2, 1, 0}};
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
                     dimensionsOrder, false, grfSize, rootDeviceEnvironment);
    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
}

TEST_P(LocalIDFixture, WhenThreadsPerWgAreGeneratedThenSizeCalculationAreCorrect) {
    auto workItems = static_cast<uint32_t>(localWorkSizeX * localWorkSizeY * localWorkSizeZ);
    auto sizeTotalPerThreadData = getThreadsPerWG(simd, workItems) * getPerThreadSizeLocalIDs(simd, grfSize);

    // Should be multiple of GRFs
    EXPECT_EQ(0u, sizeTotalPerThreadData % grfSize);

    auto numGRFsPerThread = (simd == 32) ? 2 : 1;
    auto numThreadsExpected = Math::divideAndRoundUp(workItems, simd);
    auto numGRFsExpected = 3 * numGRFsPerThread * numThreadsExpected;
    EXPECT_EQ(numGRFsExpected * grfSize, sizeTotalPerThreadData);
}
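
// Worked example for the check above: SIMD32, 32-byte GRF, 33 work items ->
// 2 threads * (3 channels * 2 GRFs per channel) * 32 bytes = 384 bytes of local id data.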

struct LocalIdsLayoutForImagesTest : ::testing::TestWithParam<std::tuple<uint16_t, uint16_t, uint16_t, uint16_t>> {
    void SetUp() override {
        simd = std::get<0>(GetParam());
        grfSize = std::get<1>(GetParam());
        localWorkSize = {{std::get<2>(GetParam()),
                          std::get<3>(GetParam()),
                          1u}};
        rowWidth = simd == 32u ? 32u : 16u;
        xDelta = simd == 8u ? 2u : 4u;
    }

    void generateLocalIds() {
        auto numGrfs = (localWorkSize.at(0) * localWorkSize.at(1) + (simd - 1)) / simd;
        elemsInBuffer = 3u * simd * numGrfs;
        if (simd == 8u) {
            elemsInBuffer *= 2;
        }
        size = elemsInBuffer * sizeof(uint16_t);
        memory = allocateAlignedMemory(size, 32);
        memset(memory.get(), 0xff, size);
        buffer = reinterpret_cast<uint16_t *>(memory.get());
        EXPECT_TRUE(isCompatibleWithLayoutForImages(localWorkSize, dimensionsOrder, simd));
        NEO::MockExecutionEnvironment mockExecutionEnvironment{};
        auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
        generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment);
    }

    void validateGRF() {
        uint32_t totalLocalIds = localWorkSize.at(0) * localWorkSize.at(1);
        auto numRows = elemsInBuffer / rowWidth;
        auto numGrfs = numRows / 3u;
        for (auto i = 0u; i < numGrfs; i++) {

            // validate X row
            uint16_t baseX = buffer[i * 3 * rowWidth];
            uint16_t baseY = buffer[i * 3 * rowWidth + rowWidth];
            uint16_t currentX = baseX;
            for (int j = 1; j < simd; j++) {
                if (simd * i + j == totalLocalIds)
                    break;
                if (simd == 32u && baseY + 8u > localWorkSize.at(1) && j == 16u) {
                    baseX += xDelta;
                    if (baseX == localWorkSize.at(0)) {
                        baseX = 0;
                    }
                }
                currentX = baseX + ((currentX + 1) & (xDelta - 1));
                EXPECT_EQ(buffer[i * 3 * rowWidth + j], currentX);
            }

            // validate Y row
            for (int j = 0; j < simd; j++) {
                if (simd * i + j == totalLocalIds)
                    break;
                uint16_t expectedY = baseY + ((j / xDelta) & 0b111);
                if (expectedY >= localWorkSize.at(1)) {
                    expectedY -= (localWorkSize.at(1) - baseY);
                }
                EXPECT_EQ(buffer[i * 3 * rowWidth + rowWidth + j], expectedY);
            }

            // validate Z row
            for (int j = 0; j < simd; j++) {
                if (simd * i + j == totalLocalIds)
                    break;
                EXPECT_EQ(buffer[i * 3 * rowWidth + 2 * rowWidth + j], 0u);
            }
        }
    }

    uint16_t simd;
    uint16_t grfSize;
    uint8_t rowWidth;
    uint16_t xDelta;
    std::array<uint16_t, 3> localWorkSize;
    std::array<uint8_t, 3> dimensionsOrder = {{0u, 1u, 2u}};
    uint32_t elemsInBuffer;
    uint32_t size;
    std::unique_ptr<void, std::function<decltype(alignedFree)>> memory;
    uint16_t *buffer;
};
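
// Note on the layout encoded by validateGRF above: X cycles through xDelta consecutive values
// (2 at SIMD8, 4 otherwise) while Y advances once every xDelta lanes, so each GRF covers a small
// 2D tile of the workgroup rather than a single row of it.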

TEST(LocalIdsLayoutForImagesTest, givenLocalWorkSizeCompatibleWithLayoutForImagesWithDefaultDimensionsOrderWhenCheckLayoutForImagesCompatibilityThenReturnTrue) {
    std::array<uint16_t, 3> localWorkSize{{4u, 4u, 1u}};
    std::array<uint8_t, 3> dimensionsOrder = {{0u, 1u, 2u}};
    EXPECT_TRUE(isCompatibleWithLayoutForImages(localWorkSize, dimensionsOrder, 16u));
    EXPECT_TRUE(isCompatibleWithLayoutForImages({{4u, 12u, 1u}}, dimensionsOrder, 32u));
}

TEST(LocalIdsLayoutForImagesTest, givenLocalWorkSizeNotCompatibleWithLayoutForImagesWithDefaultDimensionsOrderWhenCheckLayoutForImagesCompatibilityThenReturnFalse) {
    std::array<uint8_t, 3> dimensionsOrder = {{0u, 1u, 2u}};
    EXPECT_FALSE(isCompatibleWithLayoutForImages({{4u, 4u, 2u}}, dimensionsOrder, 8u));
    EXPECT_FALSE(isCompatibleWithLayoutForImages({{2u, 5u, 1u}}, dimensionsOrder, 8u));
    EXPECT_FALSE(isCompatibleWithLayoutForImages({{1u, 4u, 1u}}, dimensionsOrder, 8u));
}

TEST(LocalIdsLayoutForImagesTest, given2x4x1LocalWorkSizeWithNonDefaultDimensionsOrderWhenCheckLayoutForImagesCompatibilityThenReturnFalse) {
    std::array<uint16_t, 3> localWorkSize{{2u, 4u, 1u}};
    EXPECT_FALSE(isCompatibleWithLayoutForImages(localWorkSize, {{0, 2, 1}}, 8u));
    EXPECT_FALSE(isCompatibleWithLayoutForImages(localWorkSize, {{1, 0, 2}}, 8u));
    EXPECT_FALSE(isCompatibleWithLayoutForImages(localWorkSize, {{1, 2, 0}}, 8u));
    EXPECT_FALSE(isCompatibleWithLayoutForImages(localWorkSize, {{2, 0, 1}}, 8u));
    EXPECT_FALSE(isCompatibleWithLayoutForImages(localWorkSize, {{2, 1, 0}}, 8u));
}

using LocalIdsLayoutTest = ::testing::TestWithParam<uint16_t>;

TEST_P(LocalIdsLayoutTest, givenLocalWorkgroupSize4x4x1WhenGenerateLocalIdsThenHasKernelImagesOnlyFlagDoesntMatter) {
    uint16_t simd = GetParam();
    uint8_t rowWidth = simd == 32 ? 32 : 16;
    uint16_t xDelta = simd == 8u ? 2u : 4u;
    std::array<uint16_t, 3> localWorkSize{{xDelta, 4u, 1u}};
    uint16_t totalLocalWorkSize = 4u * xDelta;
    auto dimensionsOrder = std::array<uint8_t, 3>{{0u, 1u, 2u}};
    uint32_t grfSize = 32;

    auto elemsInBuffer = rowWidth * 3u;
    auto size = elemsInBuffer * sizeof(uint16_t);

    auto alignedMemory1 = allocateAlignedMemory(size, 32);
    auto buffer1 = reinterpret_cast<uint16_t *>(alignedMemory1.get());
    memset(buffer1, 0xff, size);

    auto alignedMemory2 = allocateAlignedMemory(size, 32);
    auto buffer2 = reinterpret_cast<uint16_t *>(alignedMemory2.get());
    memset(buffer2, 0xff, size);
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, rootDeviceEnvironment);
    generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment);

    for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) {
        for (auto j = 0u; j < rowWidth; j++) {
            if (j < totalLocalWorkSize) {
                auto offset = (i * rowWidth + j) * sizeof(uint16_t);
                auto cmpValue = memcmp(ptrOffset(buffer1, offset), ptrOffset(buffer2, offset), sizeof(uint16_t));
                EXPECT_EQ(0, cmpValue);
            }
        }
    }
}

TEST_P(LocalIdsLayoutForImagesTest, givenLocalWorkgroupSizeCompatibleWithLayoutForImagesWhenGenerateLocalIdsWithKernelWithOnlyImagesThenAppliesLayoutForImages) {
    generateLocalIds();
    validateGRF();
}

#define SIMDParams ::testing::Values(8, 16, 32)
#if HEAVY_DUTY_TESTING
#define LWSXParams ::testing::Values(1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 64, 128, 256)
#define LWSYParams ::testing::Values(1, 2, 3, 4, 5, 6, 7, 8)
#define LWSZParams ::testing::Values(1, 2, 3, 4)
#else
#define LWSXParams ::testing::Values(1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 64, 128, 256)
#define LWSYParams ::testing::Values(1, 2, 4, 8)
#define LWSZParams ::testing::Values(1)
#endif

#define GRFSizeParams ::testing::Values(32)

INSTANTIATE_TEST_CASE_P(AllCombinations, LocalIDFixture, ::testing::Combine(SIMDParams, GRFSizeParams, LWSXParams, LWSYParams, LWSZParams));
INSTANTIATE_TEST_CASE_P(LayoutTests, LocalIdsLayoutTest, SIMDParams);
INSTANTIATE_TEST_CASE_P(LayoutForImagesTests, LocalIdsLayoutForImagesTest, ::testing::Combine(SIMDParams, GRFSizeParams, ::testing::Values(4, 8, 12, 20), ::testing::Values(4, 8, 12, 20)));

// To debug a specific configuration replace the list of Values with specific values.
// NOTE: You'll need a unique test prefix
INSTANTIATE_TEST_CASE_P(SingleTest, LocalIDFixture,
                        ::testing::Combine(
                            ::testing::Values(32),  // SIMD
                            ::testing::Values(32),  // GRF
                            ::testing::Values(5),   // LWSX
                            ::testing::Values(6),   // LWSY
                            ::testing::Values(7))); // LWSZ