mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-24 21:18:24 +08:00
Fix attention bitmask on tgllp
Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
754d6e40e0
commit
582bb3786d
@@ -11,9 +11,11 @@ void L0HwHelperHw<Family>::getAttentionBitmaskForSingleThreads(std::vector<ze_de
|
||||
const uint32_t numThreadsPerEu = (hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.EUCount);
|
||||
const uint32_t bytesPerEu = alignUp(numThreadsPerEu, 8) / 8;
|
||||
const uint32_t numEuPerSubslice = std::min(hwInfo.gtSystemInfo.MaxEuPerSubSlice, 8u);
|
||||
const bool attBitsReused = !(numEuPerSubslice < 8u);
|
||||
const uint32_t threadsSizePerSlice = numSubslicesPerSlice * numEuPerSubslice * bytesPerEu;
|
||||
|
||||
const uint32_t eusPerRow = 4;
|
||||
const uint32_t numberOfRows = 2;
|
||||
|
||||
bitmaskSize = hwInfo.gtSystemInfo.MaxSubSlicesSupported * numEuPerSubslice * bytesPerEu;
|
||||
bitmask = std::make_unique<uint8_t[]>(bitmaskSize);
|
||||
|
||||
@@ -23,8 +25,9 @@ void L0HwHelperHw<Family>::getAttentionBitmaskForSingleThreads(std::vector<ze_de
|
||||
uint8_t *sliceData = ptrOffset(bitmask.get(), threadsSizePerSlice * thread.slice);
|
||||
uint8_t *subsliceData = ptrOffset(sliceData, numEuPerSubslice * bytesPerEu * thread.subslice);
|
||||
|
||||
auto dualEu = attBitsReused ? thread.eu % numEuPerSubslice : thread.eu;
|
||||
uint8_t *euData = ptrOffset(subsliceData, bytesPerEu * dualEu);
|
||||
auto eu = thread.eu % eusPerRow;
|
||||
auto dualEu = thread.eu / (numberOfRows * eusPerRow);
|
||||
uint8_t *euData = ptrOffset(subsliceData, bytesPerEu * (eu + dualEu * eusPerRow));
|
||||
UNRECOVERABLE_IF(thread.thread > 7);
|
||||
*euData |= (1 << thread.thread);
|
||||
}
|
||||
@@ -38,6 +41,8 @@ std::vector<ze_device_thread_t> L0HwHelperHw<Family>::getThreadsFromAttentionBit
|
||||
const uint32_t bytesPerEu = alignUp(numThreadsPerEu, 8) / 8;
|
||||
const uint32_t threadsSizePerSlice = numSubslicesPerSlice * numEuPerSubslice * bytesPerEu;
|
||||
const uint32_t threadsSizePerSubSlice = numEuPerSubslice * bytesPerEu;
|
||||
const uint32_t eusPerRow = 4;
|
||||
const uint32_t numberOfRows = 2;
|
||||
|
||||
UNRECOVERABLE_IF(bytesPerEu != 1);
|
||||
std::vector<ze_device_thread_t> threads;
|
||||
@@ -45,20 +50,22 @@ std::vector<ze_device_thread_t> L0HwHelperHw<Family>::getThreadsFromAttentionBit
|
||||
for (uint32_t slice = 0; slice < hwInfo.gtSystemInfo.MaxSlicesSupported; slice++) {
|
||||
for (uint32_t subslice = 0; subslice < numSubslicesPerSlice; subslice++) {
|
||||
|
||||
for (uint32_t eu = 0; eu < numEuPerSubslice; eu++) {
|
||||
size_t subSliceOffset = slice * threadsSizePerSlice + subslice * threadsSizePerSubSlice;
|
||||
|
||||
size_t offset = slice * threadsSizePerSlice + subslice * threadsSizePerSubSlice + eu * bytesPerEu;
|
||||
for (uint32_t dualEu = 0; dualEu < numberOfRows; dualEu++) {
|
||||
for (uint32_t euIndex = 0; euIndex < eusPerRow; euIndex++) {
|
||||
|
||||
if (offset >= bitmaskSize) {
|
||||
return threads;
|
||||
}
|
||||
auto offset = subSliceOffset + euIndex + dualEu * eusPerRow;
|
||||
|
||||
for (uint32_t dualEu = 0; dualEu < hwInfo.gtSystemInfo.MaxEuPerSubSlice / numEuPerSubslice; dualEu++) {
|
||||
if (offset >= bitmaskSize) {
|
||||
return threads;
|
||||
}
|
||||
|
||||
std::bitset<8> bits(bitmask[offset]);
|
||||
for (uint32_t i = 0; i < 8; i++) {
|
||||
if (bits.test(i)) {
|
||||
threads.emplace_back(ze_device_thread_t{slice, subslice, eu + numEuPerSubslice * dualEu, i});
|
||||
threads.emplace_back(ze_device_thread_t{slice, subslice, euIndex + numEuPerSubslice * dualEu, i});
|
||||
threads.emplace_back(ze_device_thread_t{slice, subslice, euIndex + eusPerRow + numEuPerSubslice * dualEu, i});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,7 +76,7 @@ HWTEST_F(L0HwHelperTest, givenSliceSubsliceEuAndThreadIdsWhenGettingBitmaskThenC
|
||||
uint32_t subslice = subslicesPerSlice > 1 ? subslicesPerSlice - 1 : 0;
|
||||
|
||||
const auto threadsPerEu = (hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.EUCount);
|
||||
const auto bytesPerEu = threadsPerEu <= 8 ? 1 : 2;
|
||||
const auto bytesPerEu = 1;
|
||||
|
||||
const auto maxEUsInAtt = hwInfo.gtSystemInfo.MaxEuPerSubSlice > 8 ? 8 : hwInfo.gtSystemInfo.MaxEuPerSubSlice;
|
||||
const auto threadsSizePerSubSlice = maxEUsInAtt * bytesPerEu;
|
||||
@@ -141,7 +141,12 @@ HWTEST_F(L0HwHelperTest, givenSliceSubsliceEuAndThreadIdsWhenGettingBitmaskThenC
|
||||
|
||||
data = ptrOffset(data, (hwInfo.gtSystemInfo.MaxSlicesSupported - 1) * threadsSizePerSlice);
|
||||
data = ptrOffset(data, subslice * threadsSizePerSubSlice);
|
||||
data = ptrOffset(data, maxEUsInAtt - 1 * bytesPerEu);
|
||||
|
||||
if (l0HwHelper.isResumeWARequired()) {
|
||||
data = ptrOffset(data, (maxEUsInAtt - 1) % 4 * bytesPerEu);
|
||||
} else {
|
||||
data = ptrOffset(data, maxEUsInAtt - 1 * bytesPerEu);
|
||||
}
|
||||
data[0] = 1;
|
||||
|
||||
printAttentionBitmask(expectedBitmask.get(), bitmask.get(), hwInfo.gtSystemInfo.MaxSlicesSupported, subslicesPerSlice, hwInfo.gtSystemInfo.MaxEuPerSubSlice, threadsPerEu);
|
||||
@@ -343,7 +348,7 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenBitmaskWithAttentionBitsForSingleThreadWhe
|
||||
|
||||
EXPECT_EQ(0u, threads[1].slice);
|
||||
EXPECT_EQ(subsliceID, threads[1].subslice);
|
||||
EXPECT_EQ(8u, threads[1].eu);
|
||||
EXPECT_EQ(4u, threads[1].eu);
|
||||
EXPECT_EQ(threadID, threads[1].thread);
|
||||
}
|
||||
|
||||
@@ -381,7 +386,7 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenBitmaskWithAttentionBitsForAllSubslicesWhe
|
||||
|
||||
threadIndex++;
|
||||
EXPECT_EQ(threadID, threads[threadIndex].thread);
|
||||
EXPECT_EQ(8u, threads[threadIndex].eu);
|
||||
EXPECT_EQ(4u, threads[threadIndex].eu);
|
||||
threadIndex++;
|
||||
}
|
||||
}
|
||||
@@ -408,9 +413,9 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenBitmaskWithAttentionBitsForAllEUsWhenGetti
|
||||
l0HwHelper.getAttentionBitmaskForSingleThreads(threadsWithAtt, hwInfo, bitmask, size);
|
||||
auto threads = l0HwHelper.getThreadsFromAttentionBitmask(hwInfo, bitmask.get(), size);
|
||||
|
||||
ASSERT_EQ(maxEUsInAtt * 2, threads.size());
|
||||
ASSERT_EQ(maxEUsInAtt, threads.size());
|
||||
|
||||
uint32_t expectedEUs[] = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15};
|
||||
uint32_t expectedEUs[] = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
for (uint32_t i = 0; i < threads.size(); i++) {
|
||||
|
||||
EXPECT_EQ(0u, threads[i].slice);
|
||||
@@ -434,22 +439,31 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenEu0To1Threads0To3BitmaskWhenGettingThreads
|
||||
ASSERT_EQ(16u, threads.size());
|
||||
|
||||
ze_device_thread_t expectedThreads[] = {
|
||||
{0, 0, 0, 0},
|
||||
{0, 0, 0, 1},
|
||||
{0, 0, 0, 2},
|
||||
{0, 0, 0, 3},
|
||||
{0, 0, 8, 0},
|
||||
{0, 0, 8, 1},
|
||||
{0, 0, 8, 2},
|
||||
{0, 0, 8, 3},
|
||||
{0, 0, 1, 0},
|
||||
{0, 0, 1, 1},
|
||||
{0, 0, 1, 2},
|
||||
{0, 0, 1, 3},
|
||||
{0, 0, 9, 0},
|
||||
{0, 0, 9, 1},
|
||||
{0, 0, 9, 2},
|
||||
{0, 0, 9, 3}};
|
||||
{0, 0, 0, 0}, {0, 0, 4, 0}, {0, 0, 0, 1}, {0, 0, 4, 1}, {0, 0, 0, 2}, {0, 0, 4, 2}, {0, 0, 0, 3}, {0, 0, 4, 3}, {0, 0, 1, 0}, {0, 0, 5, 0}, {0, 0, 1, 1}, {0, 0, 5, 1}, {0, 0, 1, 2}, {0, 0, 5, 2}, {0, 0, 1, 3}, {0, 0, 5, 3}};
|
||||
|
||||
for (uint32_t i = 0; i < 16u; i++) {
|
||||
EXPECT_EQ(expectedThreads[i].slice, threads[i].slice);
|
||||
EXPECT_EQ(expectedThreads[i].subslice, threads[i].subslice);
|
||||
EXPECT_EQ(expectedThreads[i].eu, threads[i].eu);
|
||||
EXPECT_EQ(expectedThreads[i].thread, threads[i].thread);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST2_F(L0HwHelperFusedEuTest, givenEu8To9Threads0To3BitmaskWhenGettingThreadsThenCorrectThreadsAreReturned, PlatformsWithFusedEus) {
|
||||
auto hwInfo = *NEO::defaultHwInfo.get();
|
||||
if (hwInfo.gtSystemInfo.MaxEuPerSubSlice <= 8) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
|
||||
auto &l0HwHelper = L0::L0HwHelper::get(hwInfo.platform.eRenderCoreFamily);
|
||||
|
||||
uint8_t data[] = {0x00, 0x00, 0x00, 0x00, 0x0f, 0x0f};
|
||||
auto threads = l0HwHelper.getThreadsFromAttentionBitmask(hwInfo, data, sizeof(data));
|
||||
|
||||
ASSERT_EQ(16u, threads.size());
|
||||
|
||||
ze_device_thread_t expectedThreads[] = {
|
||||
{0, 0, 8, 0}, {0, 0, 12, 0}, {0, 0, 8, 1}, {0, 0, 12, 1}, {0, 0, 8, 2}, {0, 0, 12, 2}, {0, 0, 8, 3}, {0, 0, 12, 3}, {0, 0, 9, 0}, {0, 0, 13, 0}, {0, 0, 9, 1}, {0, 0, 13, 1}, {0, 0, 9, 2}, {0, 0, 13, 2}, {0, 0, 9, 3}, {0, 0, 13, 3}};
|
||||
|
||||
for (uint32_t i = 0; i < 16u; i++) {
|
||||
EXPECT_EQ(expectedThreads[i].slice, threads[i].slice);
|
||||
@@ -500,7 +514,7 @@ HWTEST2_F(L0HwHelperFusedEuTest, givenBitmaskWithAttentionBitsForHalfOfThreadsWh
|
||||
} else {
|
||||
EXPECT_EQ(0u, threads[i].slice);
|
||||
EXPECT_EQ(subsliceIndex, threads[i].subslice);
|
||||
EXPECT_EQ(8u, threads[i].eu);
|
||||
EXPECT_EQ(4u, threads[i].eu);
|
||||
EXPECT_EQ(threadID, threads[i].thread);
|
||||
|
||||
subsliceIndex++;
|
||||
|
||||
Reference in New Issue
Block a user