mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-24 21:18:24 +08:00
fix: always use the GRF count in calculateNumThreadsPerThreadGroup
GRF size != GRF count.
Related-To: GSD-8437
Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
df54d67f40
commit
da7b03dd15
@@ -377,17 +377,17 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
}
|
||||
evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);
|
||||
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
|
||||
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
|
||||
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
simdSize, static_cast<uint32_t>(itemsInGroup), grfCount, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment));
|
||||
simdSize, grfSize, grfCount, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment));
|
||||
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
||||
perThreadDataSizeForWholeThreadGroupAllocated) {
|
||||
alignedFree(perThreadDataForWholeThreadGroup);
|
||||
@@ -405,7 +405,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
static_cast<uint16_t>(groupSizeY),
|
||||
static_cast<uint16_t>(groupSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}},
|
||||
false, grfSize, rootDeviceEnvironment);
|
||||
false, grfSize, grfCount, rootDeviceEnvironment);
|
||||
}
|
||||
|
||||
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
|
||||
|
||||
@@ -1029,12 +1029,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
||||
dispatchKernelWithImplicitArgs<FamilyType>();
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
@@ -1075,12 +1076,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
||||
dispatchKernelWithImplicitArgs<FamilyType>();
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
|
||||
@@ -306,6 +306,7 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
|
||||
mockKernel.module = &mockModule;
|
||||
const auto &device = mockModule.getDevice();
|
||||
auto grfSize = device->getHwInfo().capabilityTable.grfSize;
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
uint32_t groupSize[3] = {2, 3, 5};
|
||||
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
|
||||
@@ -315,13 +316,14 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
|
||||
auto numThreadsPerTG = gfxHelper.calculateNumThreadsPerThreadGroup(
|
||||
mockKernel.descriptor.kernelAttributes.simdSize,
|
||||
groupSize[0] * groupSize[1] * groupSize[2],
|
||||
grfSize,
|
||||
numGrf,
|
||||
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
|
||||
rootDeviceEnvironment);
|
||||
auto perThreadDataSizeForWholeTGNeeded =
|
||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
mockKernel.descriptor.kernelAttributes.simdSize,
|
||||
grfSize,
|
||||
numGrf,
|
||||
mockKernel.descriptor.kernelAttributes.numLocalIdChannels,
|
||||
groupSize[0] * groupSize[1] * groupSize[2],
|
||||
!mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
|
||||
|
||||
@@ -55,6 +55,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kerne
|
||||
const auto &hwInfo = kernel.getHardwareInfo();
|
||||
|
||||
auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
|
||||
uint32_t grfSize = hwInfo.capabilityTable.grfSize;
|
||||
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
|
||||
uint32_t requiredWalkOrder = 0u;
|
||||
@@ -69,7 +70,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kerne
|
||||
requiredWalkOrder,
|
||||
simdSize);
|
||||
auto size = kernel.getCrossThreadDataSize() +
|
||||
getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||
getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, numChannels, localWorkSize, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||
|
||||
auto pImplicitArgs = kernel.getImplicitArgs();
|
||||
if (pImplicitArgs) {
|
||||
@@ -268,9 +269,9 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
}
|
||||
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
|
||||
auto grfCount = kernel.getDescriptor().kernelAttributes.numGrfRequired;
|
||||
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
|
||||
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, !localIdsGenerationByRuntime, device.getRootDeviceEnvironment());
|
||||
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfCount, !localIdsGenerationByRuntime, device.getRootDeviceEnvironment());
|
||||
|
||||
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||
|
||||
|
||||
@@ -2281,8 +2281,9 @@ void Kernel::initializeLocalIdsCache() {
|
||||
workgroupDimensionsOrder[1],
|
||||
workgroupDimensionsOrder[2]};
|
||||
auto simdSize = getDescriptor().kernelAttributes.simdSize;
|
||||
auto grfCount = getDescriptor().kernelAttributes.numGrfRequired;
|
||||
auto grfSize = static_cast<uint8_t>(getDevice().getHardwareInfo().capabilityTable.grfSize);
|
||||
localIdsCache = std::make_unique<LocalIdsCache>(4, wgDimOrder, simdSize, grfSize, usingImagesOnly);
|
||||
localIdsCache = std::make_unique<LocalIdsCache>(4, wgDimOrder, grfCount, simdSize, grfSize, usingImagesOnly);
|
||||
}
|
||||
|
||||
void Kernel::setLocalIdsForGroup(const Vec3<uint16_t> &groupSize, void *destination) const {
|
||||
|
||||
@@ -1379,9 +1379,10 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
||||
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||
auto simdSize = kernelInfo.getMaxSimdSize();
|
||||
uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
|
||||
auto size = kernelWithImplicitArgs.getCrossThreadDataSize() +
|
||||
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(workGroupSize), false, rootDeviceEnvironment) +
|
||||
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numGrf, numChannels, Math::computeTotalElementsCount(workGroupSize), false, rootDeviceEnvironment) +
|
||||
ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), false, rootDeviceEnvironment);
|
||||
|
||||
size = alignUp(size, MemoryConstants::cacheLineSize);
|
||||
|
||||
@@ -565,8 +565,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
|
||||
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
||||
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
|
||||
auto numChannels = modifiedKernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||
size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize, !kernelUsesLocalIds, rootDeviceEnvironment);
|
||||
size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numGrf, numChannels, localWorkSize, !kernelUsesLocalIds, rootDeviceEnvironment);
|
||||
ASSERT_LE(expectedIohSize, ioh.getUsed());
|
||||
|
||||
auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
|
||||
@@ -575,7 +576,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
|
||||
std::array<uint8_t, 3>{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
|
||||
modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
|
||||
modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}},
|
||||
false, grfSize, rootDeviceEnvironment);
|
||||
false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
|
||||
alignedFree(expectedLocalIds);
|
||||
@@ -1224,12 +1225,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
||||
dispatchKernelWithImplicitArgs<FamilyType>();
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
@@ -1258,12 +1260,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
||||
dispatchKernelWithImplicitArgs<FamilyType>();
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
|
||||
@@ -137,7 +137,7 @@ struct EncodeDispatchKernel {
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
template <typename InterfaceDescriptorType>
|
||||
static void setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData,
|
||||
static void setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData,
|
||||
const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
static void *getInterfaceDescriptor(CommandContainer &container, IndirectHeap *childDsh, uint32_t &iddOffset);
|
||||
@@ -155,10 +155,10 @@ struct EncodeDispatchKernel {
|
||||
static void programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
|
||||
static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);
|
||||
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
|
||||
static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);
|
||||
|
||||
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
|
||||
|
||||
|
||||
@@ -729,7 +729,7 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
|
||||
|
||||
// No-op variant of adjustInterfaceDescriptorData: the body is empty, so the
// interface descriptor, walker command and all other arguments are left untouched.
// The two signature lines below are the before/after forms of the same definition
// as rendered by the diff; they differ only in the name of the GRF-count parameter
// (numGrf -> grfCount).
template <typename Family>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {}
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
|
||||
@@ -759,7 +759,7 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
|
||||
void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
|
||||
const auto &productHelper = device.getProductHelper();
|
||||
|
||||
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
@@ -777,7 +777,7 @@ void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispat
|
||||
|
||||
if (algorithmVersion == 2) {
|
||||
auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
|
||||
if (numGrf == 256) {
|
||||
if (grfCount == 256) {
|
||||
threadsPerXeCore /= 2;
|
||||
}
|
||||
auto tgDispatchSizeSelected = 8;
|
||||
@@ -819,11 +819,11 @@ void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispat
|
||||
}
|
||||
} else {
|
||||
if (adjustTGDispatchSize) {
|
||||
UNRECOVERABLE_IF(numGrf == 0u);
|
||||
UNRECOVERABLE_IF(grfCount == 0u);
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
|
||||
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount);
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
|
||||
const uint32_t tilesCount = device.getNumSubDevices();
|
||||
availableThreadCount *= tilesCount;
|
||||
|
||||
@@ -31,7 +31,7 @@ namespace NEO {
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
|
||||
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
auto grfSize = sizeof(typename Family::GRF);
|
||||
|
||||
@@ -14,9 +14,9 @@ template void NEO::EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Fa
|
||||
template void NEO::EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForRegularEvent<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForInOrderExec<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void NEO::EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
|
||||
template void NEO::EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::DefaultWalkerType &walkerCmd);
|
||||
template void NEO::EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, Family::DefaultWalkerType &walkerCmd);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
template void NEO::EncodeDispatchKernel<Family>::encode<Family::DefaultWalkerType>(CommandContainer &container, EncodeDispatchKernelArgs &args);
|
||||
template void NEO::EncodeDispatchKernel<Family>::encodeThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
@@ -41,7 +41,7 @@ constexpr size_t immWriteDestinationAddressAlignment = 8;
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
|
||||
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
}
|
||||
@@ -377,7 +377,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, walkerCmd);
|
||||
if (debugManager.flags.PrintKernelDispatchParameters.get()) {
|
||||
fprintf(stdout, "kernel, %s, numGrf, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n",
|
||||
fprintf(stdout, "kernel, %s, grfCount, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n",
|
||||
kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
kernelDescriptor.kernelAttributes.numGrfRequired,
|
||||
kernelDescriptor.kernelAttributes.simdSize,
|
||||
|
||||
@@ -43,10 +43,10 @@ LocalIDHelper::LocalIDHelper() {
|
||||
|
||||
LocalIDHelper LocalIDHelper::initializer;
|
||||
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
bool localIdsGeneratedByHw = false;
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw, rootDeviceEnvironment));
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
|
||||
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
||||
if (useLayoutForImages) {
|
||||
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
||||
|
||||
@@ -125,7 +125,7 @@ class GfxCoreHelper {
|
||||
virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const = 0;
|
||||
virtual uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
|
||||
virtual size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const = 0;
|
||||
virtual bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const = 0;
|
||||
@@ -174,7 +174,7 @@ class GfxCoreHelper {
|
||||
virtual bool isChipsetUniqueUUIDSupported() const = 0;
|
||||
virtual bool isTimestampShiftRequired() const = 0;
|
||||
virtual bool isRelaxedOrderingSupported() const = 0;
|
||||
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const = 0;
|
||||
virtual char const *getDefaultDeviceHierarchy() const = 0;
|
||||
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
|
||||
@@ -341,7 +341,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||
uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const override;
|
||||
|
||||
uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
size_t getMaxFillPaternSizeForCopyEngine() const override;
|
||||
|
||||
size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const override;
|
||||
@@ -401,7 +401,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||
bool isChipsetUniqueUUIDSupported() const override;
|
||||
bool isTimestampShiftRequired() const override;
|
||||
bool isRelaxedOrderingSupported() const override;
|
||||
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const override;
|
||||
char const *getDefaultDeviceHierarchy() const override;
|
||||
|
||||
|
||||
@@ -707,7 +707,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::overrideMaxWorkGroupSize(uint32_t maxWG) co
|
||||
}
|
||||
|
||||
// Generic implementation of adjustMaxWorkGroupSize: performs no adjustment and
// returns defaultMaxGroupSize as-is. The GRF count, SIMD size, HW local-id flag
// and root device environment are accepted but not read in this body.
// The two signature lines below are the before/after forms of the same definition
// as rendered by the diff; they differ only in the parameter name (numGrf -> grfCount).
template <typename GfxFamily>
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||
return defaultMaxGroupSize;
|
||||
|
||||
}
|
||||
|
||||
@@ -717,7 +717,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||
return getThreadsPerWG(simd, totalWorkItems);
|
||||
}
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroup
|
||||
const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
|
||||
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
|
||||
const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
void generateLocalIDsWithLayoutForImages(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);
|
||||
|
||||
bool isCompatibleWithLayoutForImages(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
|
||||
|
||||
@@ -21,6 +21,7 @@ struct PerThreadDataHelper {
|
||||
static inline size_t getPerThreadDataSizeTotal(
|
||||
uint32_t simd,
|
||||
uint32_t grfSize,
|
||||
uint32_t grfCount,
|
||||
uint32_t numChannels,
|
||||
size_t localWorkSize,
|
||||
bool isHwLocalIdGeneration,
|
||||
@@ -30,7 +31,7 @@ struct PerThreadDataHelper {
|
||||
return perThreadSizeLocalIDs * localWorkSize;
|
||||
}
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfSize, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||
return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfCount, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||
}
|
||||
}; // namespace PerThreadDataHelper
|
||||
} // namespace NEO
|
||||
|
||||
@@ -46,10 +46,10 @@ LocalIDHelper::LocalIDHelper() {
|
||||
LocalIDHelper LocalIDHelper::initializer;
|
||||
|
||||
// traditional function to generate local IDs
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
bool localIdsGeneratedByHw = false;
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw, rootDeviceEnvironment));
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
|
||||
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
||||
if (useLayoutForImages) {
|
||||
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
||||
|
||||
@@ -54,12 +54,13 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const
|
||||
return alignUp(implicitArgsSize, MemoryConstants::cacheLineSize);
|
||||
} else {
|
||||
auto simdSize = pImplicitArgs->simdWidth;
|
||||
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
|
||||
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize);
|
||||
Vec3<size_t> localWorkSize = {pImplicitArgs->localSizeX, pImplicitArgs->localSizeY, pImplicitArgs->localSizeZ};
|
||||
auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize);
|
||||
uint32_t localIdsSizeNeeded =
|
||||
alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
simdSize, grfSize, 3u, itemsInGroup, isHwLocalIdGeneration, rootDeviceEnvironment)),
|
||||
simdSize, grfSize, grfCount, 3u, itemsInGroup, isHwLocalIdGeneration, rootDeviceEnvironment)),
|
||||
MemoryConstants::cacheLineSize);
|
||||
return implicitArgsSize + localIdsSizeNeeded;
|
||||
}
|
||||
@@ -74,6 +75,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
|
||||
if (!patchImplicitArgsBufferInCrossThread) {
|
||||
auto simdSize = implicitArgs.simdWidth;
|
||||
auto grfSize = getGrfSize(simdSize);
|
||||
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
|
||||
auto dimensionOrder = getDimensionOrderForLocalIds(kernelDescriptor.kernelAttributes.workgroupDimensionsOrder, hwGenerationOfLocalIdsParams);
|
||||
|
||||
NEO::generateLocalIDs(
|
||||
@@ -83,7 +85,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
|
||||
static_cast<uint16_t>(implicitArgs.localSizeY),
|
||||
static_cast<uint16_t>(implicitArgs.localSizeZ)}},
|
||||
dimensionOrder,
|
||||
false, grfSize, rootDeviceEnvironment);
|
||||
false, grfSize, grfCount, rootDeviceEnvironment);
|
||||
auto sizeForLocalIdsProgramming = totalSizeToProgram - ImplicitArgs::getSize();
|
||||
ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
|
||||
}
|
||||
|
||||
@@ -13,14 +13,15 @@
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/local_id_gen.h"
|
||||
#include "shared/source/helpers/simd_helper.h"
|
||||
#include "shared/source/kernel/grf_config.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
LocalIdsCache::LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages)
|
||||
LocalIdsCache::LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint32_t grfCount, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages)
|
||||
: wgDimOrder(wgDimOrder), localIdsSizePerThread(getPerThreadSizeLocalIDs(static_cast<uint32_t>(simdSize), static_cast<uint32_t>(grfSize))),
|
||||
grfSize(grfSize), simdSize(simdSize), usesOnlyImages(usesOnlyImages) {
|
||||
grfCount(grfCount), grfSize(grfSize), simdSize(simdSize), usesOnlyImages(usesOnlyImages) {
|
||||
UNRECOVERABLE_IF(cacheSize == 0)
|
||||
cache.resize(cacheSize);
|
||||
}
|
||||
@@ -41,7 +42,7 @@ size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3<uint16_t> &group, const
|
||||
return static_cast<size_t>(numElementsInGroup * localIdsSizePerThread);
|
||||
}
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfSize, false, rootDeviceEnvironment);
|
||||
const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfCount, false, rootDeviceEnvironment);
|
||||
return static_cast<size_t>(numberOfThreads * localIdsSizePerThread);
|
||||
}
|
||||
|
||||
@@ -81,7 +82,7 @@ void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_
|
||||
entry.localIdsSizeAllocated = entry.localIdsSize;
|
||||
}
|
||||
NEO::generateLocalIDs(entry.localIdsData, static_cast<uint16_t>(simdSize),
|
||||
{group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, rootDeviceEnvironment);
|
||||
{group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, grfCount, rootDeviceEnvironment);
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
@@ -27,7 +27,7 @@ class LocalIdsCache {
|
||||
LocalIdsCache(LocalIdsCache &) = delete;
|
||||
LocalIdsCache &operator=(const LocalIdsCache &other) = delete;
|
||||
|
||||
LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages = false);
|
||||
LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint32_t grfCount, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages = false);
|
||||
~LocalIdsCache();
|
||||
|
||||
void setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
@@ -43,6 +43,7 @@ class LocalIdsCache {
|
||||
std::mutex setLocalIdsMutex;
|
||||
const std::array<uint8_t, 3> wgDimOrder;
|
||||
const uint32_t localIdsSizePerThread;
|
||||
const uint32_t grfCount;
|
||||
const uint8_t grfSize;
|
||||
const uint8_t simdSize;
|
||||
const bool usesOnlyImages;
|
||||
|
||||
@@ -28,8 +28,8 @@ namespace NEO {
|
||||
|
||||
template <>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, grfCount, walkerCmd);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2021-2023 Intel Corporation
|
||||
* Copyright (C) 2021-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -87,7 +87,7 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptor
|
||||
|
||||
template <>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
|
||||
const auto &productHelper = device.getProductHelper();
|
||||
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
if (interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup() == 1) {
|
||||
|
||||
@@ -1124,7 +1124,7 @@ HWTEST2_F(EncodeDispatchKernelTest, givenPrintKernelDispatchParametersWhenEncodi
|
||||
std::string outputString = testing::internal::GetCapturedStdout(); // stop capturing
|
||||
|
||||
EXPECT_NE(std::string::npos, outputString.find("kernel"));
|
||||
EXPECT_NE(std::string::npos, outputString.find("numGrf"));
|
||||
EXPECT_NE(std::string::npos, outputString.find("grfCount"));
|
||||
EXPECT_NE(std::string::npos, outputString.find("simdSize"));
|
||||
EXPECT_NE(std::string::npos, outputString.find("tilesCount"));
|
||||
EXPECT_NE(std::string::npos, outputString.find("implicitScaling"));
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/local_id_gen.h"
|
||||
#include "shared/source/helpers/ptr_math.h"
|
||||
#include "shared/source/kernel/grf_config.h"
|
||||
#include "shared/test/common/helpers/default_hw_info.h"
|
||||
#include "shared/test/common/helpers/unit_test_helper.h"
|
||||
#include "shared/test/common/mocks/mock_execution_environment.h"
|
||||
@@ -115,7 +116,7 @@ TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize
|
||||
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, rootDeviceEnvironment);
|
||||
generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, GrfConfig::defaultGrfNumber, rootDeviceEnvironment);
|
||||
EXPECT_EQ(localIdsView[0], 0u);
|
||||
EXPECT_EQ(localIdsView[1], 1u);
|
||||
EXPECT_EQ(localIdsView[2], 0u);
|
||||
@@ -301,6 +302,7 @@ struct LocalIDFixture : ::testing::TestWithParam<std::tuple<int, int, int, int,
|
||||
uint32_t localWorkSize;
|
||||
uint32_t simd;
|
||||
uint32_t grfSize;
|
||||
uint32_t numGrf = GrfConfig::defaultGrfNumber;
|
||||
|
||||
// Provide support for a max LWS of 256
|
||||
// 32 threads @ SIMD8
|
||||
@@ -313,7 +315,7 @@ HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenIdsAreWithinLimits) {
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment);
|
||||
std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||
}
|
||||
|
||||
@@ -321,7 +323,7 @@ HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenAllWorkItemsCovered) {
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment);
|
||||
std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||
}
|
||||
|
||||
@@ -330,7 +332,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||
dimensionsOrder, false, grfSize, rootDeviceEnvironment);
|
||||
dimensionsOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||
validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
|
||||
}
|
||||
@@ -340,7 +342,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||
dimensionsOrder, false, grfSize, rootDeviceEnvironment);
|
||||
dimensionsOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||
validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
|
||||
}
|
||||
@@ -350,7 +352,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||
dimensionsOrder, false, grfSize, rootDeviceEnvironment);
|
||||
dimensionsOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||
validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
|
||||
}
|
||||
@@ -392,7 +394,7 @@ struct LocalIdsLayoutForImagesTest : ::testing::TestWithParam<std::tuple<uint16_
|
||||
EXPECT_TRUE(isCompatibleWithLayoutForImages(localWorkSize, dimensionsOrder, simd));
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment);
|
||||
generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, numGrfs, rootDeviceEnvironment);
|
||||
}
|
||||
void validateGRF() {
|
||||
uint32_t totalLocalIds = localWorkSize.at(0) * localWorkSize.at(1);
|
||||
@@ -494,8 +496,8 @@ TEST_P(LocalIdsLayoutTest, givenLocalWorkgroupSize4x4x1WhenGenerateLocalIdsThenH
|
||||
memset(buffer2, 0xff, size);
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, rootDeviceEnvironment);
|
||||
generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment);
|
||||
generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, GrfConfig::defaultGrfNumber, rootDeviceEnvironment);
|
||||
generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, GrfConfig::defaultGrfNumber, rootDeviceEnvironment);
|
||||
|
||||
for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) {
|
||||
for (auto j = 0u; j < rowWidth; j++) {
|
||||
|
||||
@@ -79,7 +79,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
||||
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
|
||||
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
|
||||
EXPECT_EQ(localIdsSize + ImplicitArgs::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
|
||||
}
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/helpers/per_thread_data.h"
|
||||
#include "shared/source/kernel/grf_config.h"
|
||||
#include "shared/source/kernel/local_ids_cache.h"
|
||||
#include "shared/test/common/helpers/default_hw_info.h"
|
||||
#include "shared/test/common/mocks/mock_execution_environment.h"
|
||||
@@ -22,7 +23,7 @@ class MockLocalIdsCache : public NEO::LocalIdsCache {
|
||||
using Base::Base;
|
||||
using Base::cache;
|
||||
MockLocalIdsCache(size_t cacheSize) : MockLocalIdsCache(cacheSize, 32u){};
|
||||
MockLocalIdsCache(size_t cacheSize, uint8_t simd) : Base(cacheSize, {0, 1, 2}, simd, 32, false){};
|
||||
MockLocalIdsCache(size_t cacheSize, uint8_t simd) : Base(cacheSize, {0, 1, 2}, GrfConfig::defaultGrfNumber, simd, 32, false){};
|
||||
};
|
||||
struct LocalIdsCacheFixture {
|
||||
void setUp() {
|
||||
|
||||
Reference in New Issue
Block a user