Fix attention bitmask on tgllp

Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
Mateusz Hoppe
2021-08-31 18:18:36 +00:00
committed by Compute-Runtime-Automation
parent 754d6e40e0
commit 582bb3786d
2 changed files with 54 additions and 33 deletions

View File

@@ -11,9 +11,11 @@ void L0HwHelperHw<Family>::getAttentionBitmaskForSingleThreads(std::vector<ze_de
const uint32_t numThreadsPerEu = (hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.EUCount);
const uint32_t bytesPerEu = alignUp(numThreadsPerEu, 8) / 8;
const uint32_t numEuPerSubslice = std::min(hwInfo.gtSystemInfo.MaxEuPerSubSlice, 8u);
const bool attBitsReused = !(numEuPerSubslice < 8u);
const uint32_t threadsSizePerSlice = numSubslicesPerSlice * numEuPerSubslice * bytesPerEu;
const uint32_t eusPerRow = 4;
const uint32_t numberOfRows = 2;
bitmaskSize = hwInfo.gtSystemInfo.MaxSubSlicesSupported * numEuPerSubslice * bytesPerEu;
bitmask = std::make_unique<uint8_t[]>(bitmaskSize);
@@ -23,8 +25,9 @@ void L0HwHelperHw<Family>::getAttentionBitmaskForSingleThreads(std::vector<ze_de
uint8_t *sliceData = ptrOffset(bitmask.get(), threadsSizePerSlice * thread.slice);
uint8_t *subsliceData = ptrOffset(sliceData, numEuPerSubslice * bytesPerEu * thread.subslice);
auto dualEu = attBitsReused ? thread.eu % numEuPerSubslice : thread.eu;
uint8_t *euData = ptrOffset(subsliceData, bytesPerEu * dualEu);
auto eu = thread.eu % eusPerRow;
auto dualEu = thread.eu / (numberOfRows * eusPerRow);
uint8_t *euData = ptrOffset(subsliceData, bytesPerEu * (eu + dualEu * eusPerRow));
UNRECOVERABLE_IF(thread.thread > 7);
*euData |= (1 << thread.thread);
}
@@ -38,6 +41,8 @@ std::vector<ze_device_thread_t> L0HwHelperHw<Family>::getThreadsFromAttentionBit
const uint32_t bytesPerEu = alignUp(numThreadsPerEu, 8) / 8;
const uint32_t threadsSizePerSlice = numSubslicesPerSlice * numEuPerSubslice * bytesPerEu;
const uint32_t threadsSizePerSubSlice = numEuPerSubslice * bytesPerEu;
const uint32_t eusPerRow = 4;
const uint32_t numberOfRows = 2;
UNRECOVERABLE_IF(bytesPerEu != 1);
std::vector<ze_device_thread_t> threads;
@@ -45,20 +50,22 @@ std::vector<ze_device_thread_t> L0HwHelperHw<Family>::getThreadsFromAttentionBit
for (uint32_t slice = 0; slice < hwInfo.gtSystemInfo.MaxSlicesSupported; slice++) {
for (uint32_t subslice = 0; subslice < numSubslicesPerSlice; subslice++) {
for (uint32_t eu = 0; eu < numEuPerSubslice; eu++) {
size_t subSliceOffset = slice * threadsSizePerSlice + subslice * threadsSizePerSubSlice;
size_t offset = slice * threadsSizePerSlice + subslice * threadsSizePerSubSlice + eu * bytesPerEu;
for (uint32_t dualEu = 0; dualEu < numberOfRows; dualEu++) {
for (uint32_t euIndex = 0; euIndex < eusPerRow; euIndex++) {
if (offset >= bitmaskSize) {
return threads;
}
auto offset = subSliceOffset + euIndex + dualEu * eusPerRow;
for (uint32_t dualEu = 0; dualEu < hwInfo.gtSystemInfo.MaxEuPerSubSlice / numEuPerSubslice; dualEu++) {
if (offset >= bitmaskSize) {
return threads;
}
std::bitset<8> bits(bitmask[offset]);
for (uint32_t i = 0; i < 8; i++) {
if (bits.test(i)) {
threads.emplace_back(ze_device_thread_t{slice, subslice, eu + numEuPerSubslice * dualEu, i});
threads.emplace_back(ze_device_thread_t{slice, subslice, euIndex + numEuPerSubslice * dualEu, i});
threads.emplace_back(ze_device_thread_t{slice, subslice, euIndex + eusPerRow + numEuPerSubslice * dualEu, i});
}
}
}

View File

@@ -76,7 +76,7 @@ HWTEST_F(L0HwHelperTest, givenSliceSubsliceEuAndThreadIdsWhenGettingBitmaskThenC
uint32_t subslice = subslicesPerSlice > 1 ? subslicesPerSlice - 1 : 0;
const auto threadsPerEu = (hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.EUCount);
const auto bytesPerEu = threadsPerEu <= 8 ? 1 : 2;
const auto bytesPerEu = 1;
const auto maxEUsInAtt = hwInfo.gtSystemInfo.MaxEuPerSubSlice > 8 ? 8 : hwInfo.gtSystemInfo.MaxEuPerSubSlice;
const auto threadsSizePerSubSlice = maxEUsInAtt * bytesPerEu;
@@ -141,7 +141,12 @@ HWTEST_F(L0HwHelperTest, givenSliceSubsliceEuAndThreadIdsWhenGettingBitmaskThenC
data = ptrOffset(data, (hwInfo.gtSystemInfo.MaxSlicesSupported - 1) * threadsSizePerSlice);
data = ptrOffset(data, subslice * threadsSizePerSubSlice);
data = ptrOffset(data, maxEUsInAtt - 1 * bytesPerEu);
if (l0HwHelper.isResumeWARequired()) {
data = ptrOffset(data, (maxEUsInAtt - 1) % 4 * bytesPerEu);
} else {
data = ptrOffset(data, maxEUsInAtt - 1 * bytesPerEu);
}
data[0] = 1;
printAttentionBitmask(expectedBitmask.get(), bitmask.get(), hwInfo.gtSystemInfo.MaxSlicesSupported, subslicesPerSlice, hwInfo.gtSystemInfo.MaxEuPerSubSlice, threadsPerEu);
@@ -343,7 +348,7 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenBitmaskWithAttentionBitsForSingleThreadWhe
EXPECT_EQ(0u, threads[1].slice);
EXPECT_EQ(subsliceID, threads[1].subslice);
EXPECT_EQ(8u, threads[1].eu);
EXPECT_EQ(4u, threads[1].eu);
EXPECT_EQ(threadID, threads[1].thread);
}
@@ -381,7 +386,7 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenBitmaskWithAttentionBitsForAllSubslicesWhe
threadIndex++;
EXPECT_EQ(threadID, threads[threadIndex].thread);
EXPECT_EQ(8u, threads[threadIndex].eu);
EXPECT_EQ(4u, threads[threadIndex].eu);
threadIndex++;
}
}
@@ -408,9 +413,9 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenBitmaskWithAttentionBitsForAllEUsWhenGetti
l0HwHelper.getAttentionBitmaskForSingleThreads(threadsWithAtt, hwInfo, bitmask, size);
auto threads = l0HwHelper.getThreadsFromAttentionBitmask(hwInfo, bitmask.get(), size);
ASSERT_EQ(maxEUsInAtt * 2, threads.size());
ASSERT_EQ(maxEUsInAtt, threads.size());
uint32_t expectedEUs[] = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15};
uint32_t expectedEUs[] = {0, 4, 1, 5, 2, 6, 3, 7};
for (uint32_t i = 0; i < threads.size(); i++) {
EXPECT_EQ(0u, threads[i].slice);
@@ -434,22 +439,31 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenEu0To1Threads0To3BitmaskWhenGettingThreads
ASSERT_EQ(16u, threads.size());
ze_device_thread_t expectedThreads[] = {
{0, 0, 0, 0},
{0, 0, 0, 1},
{0, 0, 0, 2},
{0, 0, 0, 3},
{0, 0, 8, 0},
{0, 0, 8, 1},
{0, 0, 8, 2},
{0, 0, 8, 3},
{0, 0, 1, 0},
{0, 0, 1, 1},
{0, 0, 1, 2},
{0, 0, 1, 3},
{0, 0, 9, 0},
{0, 0, 9, 1},
{0, 0, 9, 2},
{0, 0, 9, 3}};
{0, 0, 0, 0}, {0, 0, 4, 0}, {0, 0, 0, 1}, {0, 0, 4, 1}, {0, 0, 0, 2}, {0, 0, 4, 2}, {0, 0, 0, 3}, {0, 0, 4, 3}, {0, 0, 1, 0}, {0, 0, 5, 0}, {0, 0, 1, 1}, {0, 0, 5, 1}, {0, 0, 1, 2}, {0, 0, 5, 2}, {0, 0, 1, 3}, {0, 0, 5, 3}};
for (uint32_t i = 0; i < 16u; i++) {
EXPECT_EQ(expectedThreads[i].slice, threads[i].slice);
EXPECT_EQ(expectedThreads[i].subslice, threads[i].subslice);
EXPECT_EQ(expectedThreads[i].eu, threads[i].eu);
EXPECT_EQ(expectedThreads[i].thread, threads[i].thread);
}
}
HWTEST2_F(L0HwHelperFusedEuTest, givenEu8To9Threads0To3BitmaskWhenGettingThreadsThenCorrectThreadsAreReturned, PlatformsWithFusedEus) {
auto hwInfo = *NEO::defaultHwInfo.get();
if (hwInfo.gtSystemInfo.MaxEuPerSubSlice <= 8) {
GTEST_SKIP();
}
auto &l0HwHelper = L0::L0HwHelper::get(hwInfo.platform.eRenderCoreFamily);
uint8_t data[] = {0x00, 0x00, 0x00, 0x00, 0x0f, 0x0f};
auto threads = l0HwHelper.getThreadsFromAttentionBitmask(hwInfo, data, sizeof(data));
ASSERT_EQ(16u, threads.size());
ze_device_thread_t expectedThreads[] = {
{0, 0, 8, 0}, {0, 0, 12, 0}, {0, 0, 8, 1}, {0, 0, 12, 1}, {0, 0, 8, 2}, {0, 0, 12, 2}, {0, 0, 8, 3}, {0, 0, 12, 3}, {0, 0, 9, 0}, {0, 0, 13, 0}, {0, 0, 9, 1}, {0, 0, 13, 1}, {0, 0, 9, 2}, {0, 0, 13, 2}, {0, 0, 9, 3}, {0, 0, 13, 3}};
for (uint32_t i = 0; i < 16u; i++) {
EXPECT_EQ(expectedThreads[i].slice, threads[i].slice);
@@ -500,7 +514,7 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenBitmaskWithAttentionBitsForHalfOfThreadsWh
} else {
EXPECT_EQ(0u, threads[i].slice);
EXPECT_EQ(subsliceIndex, threads[i].subslice);
EXPECT_EQ(8u, threads[i].eu);
EXPECT_EQ(4u, threads[i].eu);
EXPECT_EQ(threadID, threads[i].thread);
subsliceIndex++;