L0 debug - fix windows bitmask decode

Keep threads created in EU range (0,7)

Signed-off-by: Yates, Brandon <brandon.yates@intel.com>
This commit is contained in:
Yates, Brandon
2022-10-27 14:39:14 +00:00
committed by Compute-Runtime-Automation
parent ac426f7fdc
commit 99ef6c499b
7 changed files with 86 additions and 35 deletions

View File

@@ -7,7 +7,8 @@
template <>
void L0HwHelperHw<Family>::getAttentionBitmaskForSingleThreads(const std::vector<EuThread::ThreadId> &threads, const NEO::HardwareInfo &hwInfo, std::unique_ptr<uint8_t[]> &bitmask, size_t &bitmaskSize) const {
const uint32_t numSubslicesPerSlice = hwInfo.gtSystemInfo.MaxSubSlicesSupported / hwInfo.gtSystemInfo.MaxSlicesSupported;
const uint32_t numSubslicesPerSlice = (hwInfo.gtSystemInfo.MaxEuPerSubSlice == 8) ? hwInfo.gtSystemInfo.MaxDualSubSlicesSupported / hwInfo.gtSystemInfo.MaxSlicesSupported : hwInfo.gtSystemInfo.MaxSubSlicesSupported / hwInfo.gtSystemInfo.MaxSlicesSupported;
const uint32_t numThreadsPerEu = (hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.EUCount);
const uint32_t bytesPerEu = alignUp(numThreadsPerEu, 8) / 8;
const uint32_t numEuPerSubslice = std::min(hwInfo.gtSystemInfo.MaxEuPerSubSlice, 8u);
@@ -24,11 +25,20 @@ void L0HwHelperHw<Family>::getAttentionBitmaskForSingleThreads(const std::vector
for (auto &thread : threads) {
uint8_t *sliceData = ptrOffset(bitmask.get(), threadsSizePerSlice * thread.slice);
uint8_t *subsliceData = ptrOffset(sliceData, numEuPerSubslice * bytesPerEu * thread.subslice);
uint8_t *euData;
if (hwInfo.gtSystemInfo.MaxEuPerSubSlice == 8) {
uint8_t *subsliceData = ptrOffset(sliceData, numEuPerSubslice * bytesPerEu * (thread.subslice / 2));
auto eu = thread.eu % eusPerRow;
auto dualEu = thread.subslice % 2;
euData = ptrOffset(subsliceData, bytesPerEu * (eu + dualEu * eusPerRow));
} else {
uint8_t *subsliceData = ptrOffset(sliceData, numEuPerSubslice * bytesPerEu * thread.subslice);
auto eu = thread.eu % eusPerRow;
auto dualEu = thread.eu / (numberOfRows * eusPerRow);
euData = ptrOffset(subsliceData, bytesPerEu * (eu + dualEu * eusPerRow));
}
auto eu = thread.eu % eusPerRow;
auto dualEu = thread.eu / (numberOfRows * eusPerRow);
uint8_t *euData = ptrOffset(subsliceData, bytesPerEu * (eu + dualEu * eusPerRow));
UNRECOVERABLE_IF(thread.thread > 7);
*euData |= (1 << thread.thread);
}
@@ -36,7 +46,8 @@ void L0HwHelperHw<Family>::getAttentionBitmaskForSingleThreads(const std::vector
template <>
std::vector<EuThread::ThreadId> L0HwHelperHw<Family>::getThreadsFromAttentionBitmask(const NEO::HardwareInfo &hwInfo, uint32_t tile, const uint8_t *bitmask, const size_t bitmaskSize) const {
const uint32_t numSubslicesPerSlice = hwInfo.gtSystemInfo.MaxSubSlicesSupported / hwInfo.gtSystemInfo.MaxSlicesSupported;
const uint32_t numSubslicesPerSlice = (hwInfo.gtSystemInfo.MaxEuPerSubSlice == 8) ? hwInfo.gtSystemInfo.MaxDualSubSlicesSupported / hwInfo.gtSystemInfo.MaxSlicesSupported : hwInfo.gtSystemInfo.MaxSubSlicesSupported / hwInfo.gtSystemInfo.MaxSlicesSupported;
const uint32_t numEuPerSubslice = std::min(hwInfo.gtSystemInfo.MaxEuPerSubSlice, 8u);
const uint32_t numThreadsPerEu = (hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.EUCount);
const uint32_t bytesPerEu = alignUp(numThreadsPerEu, 8) / 8;
@@ -66,8 +77,13 @@ std::vector<EuThread::ThreadId> L0HwHelperHw<Family>::getThreadsFromAttentionBit
std::bitset<8> bits(bitmask[offset]);
for (uint32_t i = 0; i < 8; i++) {
if (bits.test(i)) {
threads.emplace_back(tile, slice, subslice, euIndex + numEuPerSubslice * dualEu, i);
threads.emplace_back(tile, slice, subslice, euIndex + eusPerRow + numEuPerSubslice * dualEu, i);
if (hwInfo.gtSystemInfo.MaxEuPerSubSlice == 8) {
threads.emplace_back(tile, slice, (subslice * 2) + dualEu, euIndex, i);
threads.emplace_back(tile, slice, (subslice * 2) + dualEu, euIndex + eusPerRow, i);
} else {
threads.emplace_back(tile, slice, subslice, euIndex + numEuPerSubslice * dualEu, i);
threads.emplace_back(tile, slice, subslice, euIndex + eusPerRow + numEuPerSubslice * dualEu, i);
}
}
}
}