fix: add function to calculate number of threads per tg

Signed-off-by: Cencelewska, Katarzyna <katarzyna.cencelewska@intel.com>
This commit is contained in:
Cencelewska, Katarzyna
2023-06-12 11:41:13 +00:00
committed by Compute-Runtime-Automation
parent 987394b27c
commit 7cb3278eb3
16 changed files with 71 additions and 21 deletions

View File

@@ -317,10 +317,9 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
} }
} }
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ); patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
auto remainderSimdLanes = itemsInGroup & (simdSize - 1u); auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
threadExecutionMask = static_cast<uint32_t>(maxNBitValue(remainderSimdLanes)); threadExecutionMask = static_cast<uint32_t>(maxNBitValue(remainderSimdLanes));
if (!threadExecutionMask) { if (!threadExecutionMask) {
@@ -328,6 +327,12 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
} }
evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor); evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, kernelRequiresGenerationOfLocalIdsByRuntime);
if (kernelRequiresGenerationOfLocalIdsByRuntime) { if (kernelRequiresGenerationOfLocalIdsByRuntime) {
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize; auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
uint32_t perThreadDataSizeForWholeThreadGroupNeeded = uint32_t perThreadDataSizeForWholeThreadGroupNeeded =

View File

@@ -29,10 +29,10 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
bool localIdsGenerationByRuntime, bool localIdsGenerationByRuntime,
bool inlineDataProgrammingRequired, bool inlineDataProgrammingRequired,
uint32_t requiredWorkgroupOrder) { uint32_t requiredWorkgroupOrder) {
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2]; auto localWorkSize = static_cast<uint32_t>(localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2]);
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize); auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup)); walkerCmd->setThreadWidthCounterMaximum(threadsPerWorkGroup);
walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0])); walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1])); walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));

View File

@@ -240,9 +240,10 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(), kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
device.getRootDeviceEnvironment()); device.getRootDeviceEnvironment());
} }
auto &gfxCoreHelper = device.getGfxCoreHelper();
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2]; auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems)); auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, localIdsGenerationByRuntime);
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize(); uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();

View File

@@ -729,7 +729,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
// only X is present // only X is present
auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, sizeGrf); auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, sizeGrf);
sizePerThreadData = std::max(sizePerThreadData, sizeGrf); sizePerThreadData = std::max(sizePerThreadData, sizeGrf);
size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData; size_t perThreadTotalDataSize = getThreadsPerWG(simd, static_cast<uint32_t>(lws[0])) * sizePerThreadData;
uint32_t expectedIndirectDataLength = alignUp(static_cast<uint32_t>(perThreadTotalDataSize), COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); uint32_t expectedIndirectDataLength = alignUp(static_cast<uint32_t>(perThreadTotalDataSize), COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength()); EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength());
@@ -832,13 +832,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
// only X is present // only X is present
uint32_t localIdSizePerThread = getPerThreadSizeLocalIDs(simd, sizeGrf); uint32_t localIdSizePerThread = getPerThreadSizeLocalIDs(simd, sizeGrf);
localIdSizePerThread = std::max(localIdSizePerThread, sizeGrf); localIdSizePerThread = std::max(localIdSizePerThread, sizeGrf);
auto sizePerThreadData = getThreadsPerWG(simd, lws[0]) * localIdSizePerThread; auto sizePerThreadData = getThreadsPerWG(simd, static_cast<uint32_t>(lws[0])) * localIdSizePerThread;
auto crossThreadDataSize = kernel->mockKernel->getCrossThreadDataSize(); auto crossThreadDataSize = kernel->mockKernel->getCrossThreadDataSize();
crossThreadDataSize -= std::min(static_cast<uint32_t>(sizeof(INLINE_DATA)), crossThreadDataSize); crossThreadDataSize -= std::min(static_cast<uint32_t>(sizeof(INLINE_DATA)), crossThreadDataSize);
// second GRF in indirect // second GRF in indirect
uint32_t expectedIndirectDataLength = static_cast<uint32_t>(sizePerThreadData + crossThreadDataSize); uint32_t expectedIndirectDataLength = sizePerThreadData + crossThreadDataSize;
expectedIndirectDataLength = alignUp(expectedIndirectDataLength, COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); expectedIndirectDataLength = alignUp(expectedIndirectDataLength, COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength()); EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength());

View File

@@ -354,9 +354,9 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
walkerCmd.setSimdSize(getSimdConfig<WALKER_TYPE>(simd)); walkerCmd.setSimdSize(getSimdConfig<WALKER_TYPE>(simd));
auto localWorkSize = workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]; auto localWorkSize = static_cast<uint32_t>(workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]);
if (threadsPerThreadGroup == 0) { if (threadsPerThreadGroup == 0) {
threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkSize)); threadsPerThreadGroup = getThreadsPerWG(simd, localWorkSize);
} }
walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup); walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);

View File

@@ -144,6 +144,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch C
DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set") DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation") DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved") DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
DECLARE_DEBUG_VARIABLE(int32_t, ForceNumberOfThreadsInGpgpuThreadGroup, -1, "-1 - default, set NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching") DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP") DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space") DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (C) 2018-2022 Intel Corporation * Copyright (C) 2018-2023 Intel Corporation
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
* *
@@ -42,7 +42,7 @@ LocalIDHelper::LocalIDHelper() {
LocalIDHelper LocalIDHelper::initializer; LocalIDHelper LocalIDHelper::initializer;
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) { void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])); auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd); bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) { if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd); generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);

View File

@@ -168,6 +168,7 @@ class GfxCoreHelper {
virtual bool isChipsetUniqueUUIDSupported() const = 0; virtual bool isChipsetUniqueUUIDSupported() const = 0;
virtual bool isTimestampShiftRequired() const = 0; virtual bool isTimestampShiftRequired() const = 0;
virtual bool isRelaxedOrderingSupported() const = 0; virtual bool isRelaxedOrderingSupported() const = 0;
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const = 0;
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper); static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
virtual ~GfxCoreHelper() = default; virtual ~GfxCoreHelper() = default;
@@ -380,6 +381,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
bool isChipsetUniqueUUIDSupported() const override; bool isChipsetUniqueUUIDSupported() const override;
bool isTimestampShiftRequired() const override; bool isTimestampShiftRequired() const override;
bool isRelaxedOrderingSupported() const override; bool isRelaxedOrderingSupported() const override;
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const override;
~GfxCoreHelperHw() override = default; ~GfxCoreHelperHw() override = default;

View File

@@ -16,6 +16,7 @@
#include "shared/source/helpers/constants.h" #include "shared/source/helpers/constants.h"
#include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h" #include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/pipe_control_args.h" #include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/helpers/timestamp_packet.h" #include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/memory_manager/allocation_properties.h" #include "shared/source/memory_manager/allocation_properties.h"
@@ -685,4 +686,12 @@ template <typename GfxFamily>
uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const { uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
return 128u; return 128u;
} }
// Returns the number of threads to use for a single thread group.
// When the ForceNumberOfThreadsInGpgpuThreadGroup debug flag holds anything
// other than -1, that value is returned as-is; otherwise the result comes from
// getThreadsPerWG(simd, totalWorkItems).
// Note: grfSize and isHwLocalGeneration are not used by this implementation.
template <typename GfxFamily>
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const {
    // Read the debug override once; -1 means "no override requested".
    const auto forcedThreadCount = DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get();
    const bool overrideRequested = (forcedThreadCount != -1);

    if (!overrideRequested) {
        return getThreadsPerWG(simd, totalWorkItems);
    }
    // The flag is a signed value while the return type is unsigned.
    return static_cast<uint32_t>(forcedThreadCount);
}
} // namespace NEO } // namespace NEO

View File

@@ -16,7 +16,7 @@ inline uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) {
return (simd == 32 && grfSize == 32) ? 2 : 1; return (simd == 32 && grfSize == 32) ? 2 : 1;
} }
inline size_t getThreadsPerWG(uint32_t simd, size_t lws) { inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) {
auto result = lws + simd - 1; auto result = lws + simd - 1;
// Original logic: // Original logic:

View File

@@ -20,7 +20,7 @@ struct PerThreadDataHelper {
uint32_t grfSize, uint32_t grfSize,
uint32_t numChannels, uint32_t numChannels,
size_t localWorkSize) { size_t localWorkSize) {
return getThreadsPerWG(simd, localWorkSize) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels); return getThreadsPerWG(simd, static_cast<uint32_t>(localWorkSize)) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
} }
}; // namespace PerThreadDataHelper }; // namespace PerThreadDataHelper
} // namespace NEO } // namespace NEO

View File

@@ -45,7 +45,7 @@ LocalIDHelper LocalIDHelper::initializer;
// traditional function to generate local IDs // traditional function to generate local IDs
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) { void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])); auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd); bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) { if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd); generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (C) 2022 Intel Corporation * Copyright (C) 2022-2023 Intel Corporation
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
* *
@@ -33,9 +33,9 @@ std::unique_lock<std::mutex> LocalIdsCache::lock() {
} }
size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3<uint16_t> &group) const { size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3<uint16_t> &group) const {
const auto numElementsInGroup = Math::computeTotalElementsCount({group[0], group[1], group[2]}); const auto numElementsInGroup = static_cast<uint32_t>(Math::computeTotalElementsCount({group[0], group[1], group[2]}));
const auto numberOfThreads = getThreadsPerWG(simdSize, numElementsInGroup); const auto numberOfThreads = getThreadsPerWG(simdSize, numElementsInGroup);
return numberOfThreads * static_cast<size_t>(localIdsSizePerThread); return static_cast<size_t>(numberOfThreads * localIdsSizePerThread);
} }
size_t LocalIdsCache::getLocalIdsSizePerThread() const { size_t LocalIdsCache::getLocalIdsSizePerThread() const {

View File

@@ -534,4 +534,5 @@ PrintGlobalTimestampInNs = 0
EnableDeviceStateVerification = -1 EnableDeviceStateVerification = -1
VfBarResourceAllocationWa = 1 VfBarResourceAllocationWa = 1
EnableDynamicPostSyncAllocLayout = -1 EnableDynamicPostSyncAllocLayout = -1
ForceNumberOfThreadsInGpgpuThreadGroup = -1
# Please don't edit below this line # Please don't edit below this line

View File

@@ -1568,3 +1568,34 @@ HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeT
numGrfRequired = GrfConfig::DefaultGrfNumber; numGrfRequired = GrfConfig::DefaultGrfNumber;
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize)); EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize));
} }
// Checks calculateNumThreadsPerThreadGroup against a table of
// (SIMT size, total work items) inputs and their expected thread counts.
HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) {
    auto &gfxCoreHelper = getHelper<GfxCoreHelper>();

    // Arguments shared by every table row.
    constexpr uint32_t grfSize = 32u;
    constexpr bool isHwLocalGeneration = true;

    // Row layout: SIMT size, total work items, expected number of threads.
    const std::array<std::array<uint32_t, 3>, 8> testCases = {{
        {32u, 32u, 1u},
        {32u, 64u, 2u},
        {32u, 128u, 4u},
        {32u, 1024u, 32u},
        {16u, 32u, 2u},
        {16u, 64u, 4u},
        {16u, 128u, 8u},
        {16u, 1024u, 64u},
    }};

    for (const auto &testCase : testCases) {
        const uint32_t simtSize = testCase[0];
        const uint32_t totalWgSize = testCase[1];
        const uint32_t expectedNumThreadsPerThreadGroup = testCase[2];

        const auto calculated = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, isHwLocalGeneration);
        EXPECT_EQ(expectedNumThreadsPerThreadGroup, calculated);
    }
}
// Checks that a value set through the ForceNumberOfThreadsInGpgpuThreadGroup
// debug flag is what calculateNumThreadsPerThreadGroup returns.
HWTEST_F(GfxCoreHelperTest, givenFlagForceNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
    DebugManagerStateRestore dbgRestore;
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();

    // Sets the flag to the given value and expects it back from the helper.
    auto verifyForcedValueIsReturned = [&gfxCoreHelper](uint32_t forcedNumThreads) {
        DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(forcedNumThreads);
        EXPECT_EQ(forcedNumThreads, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
    };

    verifyForcedValueIsReturned(10u);
    verifyForcedValueIsReturned(20u);
}

View File

@@ -40,7 +40,7 @@ HWTEST_F(LocalIdTests, GivenSimd32AndNon32GrfSizeWhenGettingGrfsPerThreadThenTwo
} }
TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) { TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) {
size_t lws = 33; uint32_t lws = 33;
uint32_t simd = 32; uint32_t simd = 32;
EXPECT_EQ(2u, getThreadsPerWG(simd, lws)); EXPECT_EQ(2u, getThreadsPerWG(simd, lws));
} }
@@ -313,7 +313,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
} }
TEST_P(LocalIDFixture, WhenThreadsPerWgAreGeneratedThenSizeCalculationAreCorrect) { TEST_P(LocalIDFixture, WhenThreadsPerWgAreGeneratedThenSizeCalculationAreCorrect) {
auto workItems = localWorkSizeX * localWorkSizeY * localWorkSizeZ; auto workItems = static_cast<uint32_t>(localWorkSizeX * localWorkSizeY * localWorkSizeZ);
auto sizeTotalPerThreadData = getThreadsPerWG(simd, workItems) * getPerThreadSizeLocalIDs(simd, grfSize); auto sizeTotalPerThreadData = getThreadsPerWG(simd, workItems) * getPerThreadSizeLocalIDs(simd, grfSize);
// Should be multiple of GRFs // Should be multiple of GRFs