mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-24 04:12:57 +08:00
fix: Correct logic for SIMD1
- When calculating the number of threads per workgroup, treat SIMD 1 as if it were SIMD 32 - Correct the logic for calculating the space for per-thread data for SIMD 1 - Minor: unit tests refactor - Corrected naming Related-To: NEO-8261 Signed-off-by: Kacper Nowak <kacper.nowak@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
b053e9348e
commit
fc099ead2e
@@ -12,6 +12,7 @@
|
|||||||
#include "shared/source/helpers/bindless_heaps_helper.h"
|
#include "shared/source/helpers/bindless_heaps_helper.h"
|
||||||
#include "shared/source/helpers/gfx_core_helper.h"
|
#include "shared/source/helpers/gfx_core_helper.h"
|
||||||
#include "shared/source/helpers/local_memory_access_modes.h"
|
#include "shared/source/helpers/local_memory_access_modes.h"
|
||||||
|
#include "shared/source/helpers/per_thread_data.h"
|
||||||
#include "shared/source/helpers/ray_tracing_helper.h"
|
#include "shared/source/helpers/ray_tracing_helper.h"
|
||||||
#include "shared/source/indirect_heap/indirect_heap.h"
|
#include "shared/source/indirect_heap/indirect_heap.h"
|
||||||
#include "shared/source/kernel/implicit_args.h"
|
#include "shared/source/kernel/implicit_args.h"
|
||||||
@@ -295,19 +296,36 @@ TEST_F(SetKernelArgCacheTest, givenValidBufferArgumentWhenSetMultipleTimesThenSe
|
|||||||
|
|
||||||
using KernelImpSetGroupSizeTest = Test<DeviceFixture>;
|
using KernelImpSetGroupSizeTest = Test<DeviceFixture>;
|
||||||
|
|
||||||
TEST_F(KernelImpSetGroupSizeTest, WhenCalculatingLocalIdsThenGrfSizeIsTakenFromCapabilityTable) {
|
TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSettingGroupSizeThenProperlyGenerateLocalIds) {
|
||||||
Mock<KernelImp> mockKernel;
|
Mock<KernelImp> mockKernel;
|
||||||
Mock<Module> mockModule(this->device, nullptr);
|
Mock<Module> mockModule(this->device, nullptr);
|
||||||
mockKernel.descriptor.kernelAttributes.simdSize = 1;
|
mockKernel.descriptor.kernelAttributes.simdSize = 1;
|
||||||
|
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = true; // although it is enabled for SIMD 1, make sure it is enforced
|
||||||
mockKernel.descriptor.kernelAttributes.numLocalIdChannels = 3;
|
mockKernel.descriptor.kernelAttributes.numLocalIdChannels = 3;
|
||||||
mockKernel.module = &mockModule;
|
mockKernel.module = &mockModule;
|
||||||
auto grfSize = mockModule.getDevice()->getHwInfo().capabilityTable.grfSize;
|
auto grfSize = mockModule.getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||||
uint32_t groupSize[3] = {2, 3, 5};
|
uint32_t groupSize[3] = {2, 3, 5};
|
||||||
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
|
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
|
||||||
EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
|
EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
|
||||||
EXPECT_EQ(groupSize[0] * groupSize[1] * groupSize[2], mockKernel.numThreadsPerThreadGroup);
|
|
||||||
EXPECT_EQ(grfSize * groupSize[0] * groupSize[1] * groupSize[2], mockKernel.perThreadDataSizeForWholeThreadGroup);
|
const auto &gfxHelper = mockModule.getDevice()->getGfxCoreHelper();
|
||||||
ASSERT_LE(grfSize * groupSize[0] * groupSize[1] * groupSize[2], mockKernel.perThreadDataSizeForWholeThreadGroup);
|
auto numThreadsPerTG = gfxHelper.calculateNumThreadsPerThreadGroup(
|
||||||
|
mockKernel.descriptor.kernelAttributes.simdSize,
|
||||||
|
groupSize[0] * groupSize[1] * groupSize[2],
|
||||||
|
grfSize,
|
||||||
|
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime);
|
||||||
|
auto perThreadDataSizeForWholeTGNeeded =
|
||||||
|
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||||
|
mockKernel.descriptor.kernelAttributes.simdSize,
|
||||||
|
grfSize,
|
||||||
|
mockKernel.descriptor.kernelAttributes.numLocalIdChannels,
|
||||||
|
groupSize[0] * groupSize[1] * groupSize[2],
|
||||||
|
!mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
|
||||||
|
gfxHelper));
|
||||||
|
|
||||||
|
EXPECT_EQ(numThreadsPerTG, mockKernel.getNumThreadsPerThreadGroup());
|
||||||
|
EXPECT_EQ((perThreadDataSizeForWholeTGNeeded / numThreadsPerTG), mockKernel.perThreadDataSize);
|
||||||
|
|
||||||
using LocalIdT = unsigned short;
|
using LocalIdT = unsigned short;
|
||||||
auto threadOffsetInLocalIds = grfSize / sizeof(LocalIdT);
|
auto threadOffsetInLocalIds = grfSize / sizeof(LocalIdT);
|
||||||
auto generatedLocalIds = reinterpret_cast<LocalIdT *>(mockKernel.perThreadDataForWholeThreadGroup);
|
auto generatedLocalIds = reinterpret_cast<LocalIdT *>(mockKernel.perThreadDataForWholeThreadGroup);
|
||||||
@@ -335,7 +353,6 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeDisabledWhenSet
|
|||||||
uint32_t groupSize[3] = {2, 3, 5};
|
uint32_t groupSize[3] = {2, 3, 5};
|
||||||
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
|
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
|
||||||
EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
|
EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
|
||||||
EXPECT_EQ(groupSize[0] * groupSize[1] * groupSize[2], mockKernel.numThreadsPerThreadGroup);
|
|
||||||
EXPECT_EQ(0u, mockKernel.perThreadDataSizeForWholeThreadGroup);
|
EXPECT_EQ(0u, mockKernel.perThreadDataSizeForWholeThreadGroup);
|
||||||
EXPECT_EQ(0u, mockKernel.perThreadDataSize);
|
EXPECT_EQ(0u, mockKernel.perThreadDataSize);
|
||||||
EXPECT_EQ(nullptr, mockKernel.perThreadDataForWholeThreadGroup);
|
EXPECT_EQ(nullptr, mockKernel.perThreadDataForWholeThreadGroup);
|
||||||
|
|||||||
@@ -7,17 +7,21 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include "shared/source/helpers/simd_helper.h"
|
||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
|
||||||
namespace NEO {
|
namespace NEO {
|
||||||
class GfxCoreHelper;
|
class GfxCoreHelper;
|
||||||
inline uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) {
|
inline uint32_t getNumGrfsPerLocalIdCoordinate(uint32_t simd, uint32_t grfSize) {
|
||||||
return (simd == 32 && grfSize == 32) ? 2 : 1;
|
return (simd == 32 && grfSize == 32) ? 2 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) {
|
inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) {
|
||||||
|
if (isSimd1(simd))
|
||||||
|
simd = 32;
|
||||||
auto result = lws + simd - 1;
|
auto result = lws + simd - 1;
|
||||||
|
|
||||||
// Original logic:
|
// Original logic:
|
||||||
@@ -27,17 +31,17 @@ inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) {
|
|||||||
? 5
|
? 5
|
||||||
: simd == 16
|
: simd == 16
|
||||||
? 4
|
? 4
|
||||||
: simd == 8
|
: 3; // for SIMD 8
|
||||||
? 3
|
|
||||||
: 0;
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline uint32_t getPerThreadSizeLocalIDs(uint32_t simd, uint32_t grfSize, uint32_t numChannels = 3) {
|
inline uint32_t getPerThreadSizeLocalIDs(uint32_t simd, uint32_t grfSize, uint32_t numChannels = 3) {
|
||||||
auto numGRFSPerThread = getGRFsPerThread(simd, grfSize);
|
if (isSimd1(simd)) {
|
||||||
uint32_t returnSize = numGRFSPerThread * grfSize * (simd == 1 ? 1u : numChannels);
|
return grfSize;
|
||||||
returnSize = std::max(returnSize, grfSize);
|
}
|
||||||
|
auto numGRFSPerLocalIdCoord = getNumGrfsPerLocalIdCoordinate(simd, grfSize);
|
||||||
|
uint32_t returnSize = numGRFSPerLocalIdCoord * grfSize * numChannels;
|
||||||
return returnSize;
|
return returnSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include "shared/source/helpers/gfx_core_helper.h"
|
#include "shared/source/helpers/gfx_core_helper.h"
|
||||||
#include "shared/source/helpers/local_id_gen.h"
|
#include "shared/source/helpers/local_id_gen.h"
|
||||||
|
#include "shared/source/helpers/simd_helper.h"
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@@ -23,7 +24,11 @@ struct PerThreadDataHelper {
|
|||||||
size_t localWorkSize,
|
size_t localWorkSize,
|
||||||
bool isHwLocalIdGeneration,
|
bool isHwLocalIdGeneration,
|
||||||
const GfxCoreHelper &gfxCoreHelper) {
|
const GfxCoreHelper &gfxCoreHelper) {
|
||||||
return gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfSize, isHwLocalIdGeneration) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
|
auto perThreadSizeLocalIDs = static_cast<size_t>(getPerThreadSizeLocalIDs(simd, grfSize, numChannels));
|
||||||
|
if (isSimd1(simd)) {
|
||||||
|
return perThreadSizeLocalIDs * localWorkSize;
|
||||||
|
}
|
||||||
|
return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfSize, isHwLocalIdGeneration);
|
||||||
}
|
}
|
||||||
}; // namespace PerThreadDataHelper
|
}; // namespace PerThreadDataHelper
|
||||||
} // namespace NEO
|
} // namespace NEO
|
||||||
|
|||||||
@@ -23,22 +23,27 @@ using LocalIdTests = ::testing::Test;
|
|||||||
|
|
||||||
HWTEST_F(LocalIdTests, GivenSimd8WhenGettingGrfsPerThreadThenOneIsReturned) {
|
HWTEST_F(LocalIdTests, GivenSimd8WhenGettingGrfsPerThreadThenOneIsReturned) {
|
||||||
uint32_t simd = 8;
|
uint32_t simd = 8;
|
||||||
EXPECT_EQ(1u, getGRFsPerThread(simd, 32));
|
EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32));
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST_F(LocalIdTests, GivenSimd16WhenGettingGrfsPerThreadThenOneIsReturned) {
|
HWTEST_F(LocalIdTests, GivenSimd16WhenGettingGrfsPerThreadThenOneIsReturned) {
|
||||||
uint32_t simd = 16;
|
uint32_t simd = 16;
|
||||||
EXPECT_EQ(1u, getGRFsPerThread(simd, 32));
|
EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32));
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST_F(LocalIdTests, GivenSimd32WhenGettingGrfsPerThreadThenTwoIsReturned) {
|
HWTEST_F(LocalIdTests, GivenSimd32WhenGettingGrfsPerThreadThenTwoIsReturned) {
|
||||||
uint32_t simd = 32;
|
uint32_t simd = 32;
|
||||||
EXPECT_EQ(2u, getGRFsPerThread(simd, 32));
|
EXPECT_EQ(2u, getNumGrfsPerLocalIdCoordinate(simd, 32));
|
||||||
|
}
|
||||||
|
|
||||||
|
HWTEST_F(LocalIdTests, GivenSimd1WhenGettingGrfsPerThreadThenOneIsReturned) {
|
||||||
|
uint32_t simd = 1;
|
||||||
|
EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32));
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST_F(LocalIdTests, GivenSimd32AndNon32GrfSizeWhenGettingGrfsPerThreadThenTwoIsReturned) {
|
HWTEST_F(LocalIdTests, GivenSimd32AndNon32GrfSizeWhenGettingGrfsPerThreadThenTwoIsReturned) {
|
||||||
uint32_t simd = 32;
|
uint32_t simd = 32;
|
||||||
EXPECT_EQ(1u, getGRFsPerThread(simd, 33));
|
EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 33));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) {
|
TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) {
|
||||||
@@ -78,6 +83,26 @@ TEST(LocalID, GivenSimd1WhenGettingPerThreadSizeLocalIdsThenValueIsEqualGrfSize)
|
|||||||
EXPECT_EQ(grfSize, getPerThreadSizeLocalIDs(simd, grfSize));
|
EXPECT_EQ(grfSize, getPerThreadSizeLocalIDs(simd, grfSize));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(LocalID, WhenThreadsPerWgAreGeneratedThenCalculationsAreCorrect) {
|
||||||
|
const auto lws = 33u;
|
||||||
|
for (const auto &simd : {1u, 8u, 16u, 32u}) {
|
||||||
|
switch (simd) {
|
||||||
|
case 1u: // treat SIMD 1 as SIMD 32 in such case
|
||||||
|
case 32u:
|
||||||
|
EXPECT_EQ((lws + std::max(32u, simd) - 1) >> 5, getThreadsPerWG(simd, lws));
|
||||||
|
break;
|
||||||
|
case 8u:
|
||||||
|
EXPECT_EQ((lws + simd - 1) >> 3, getThreadsPerWG(simd, lws));
|
||||||
|
break;
|
||||||
|
case 16u:
|
||||||
|
EXPECT_EQ((lws + simd - 1) >> 4, getThreadsPerWG(simd, lws));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize) {
|
TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize) {
|
||||||
auto localIdsPtr = allocateAlignedMemory(3 * 64u, MemoryConstants::cacheLineSize);
|
auto localIdsPtr = allocateAlignedMemory(3 * 64u, MemoryConstants::cacheLineSize);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user