refactor: add param rootDeviceEnvironment to calculateNumThreadsPerThreadGroup
Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
This commit is contained in:
parent
ec009cf9e3
commit
dd1d52259e
|
@ -381,13 +381,13 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||||
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||||
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
|
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
|
||||||
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime);
|
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||||
|
|
||||||
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
|
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
|
||||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||||
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
||||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||||
simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper));
|
simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment));
|
||||||
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
||||||
perThreadDataSizeForWholeThreadGroupAllocated) {
|
perThreadDataSizeForWholeThreadGroupAllocated) {
|
||||||
alignedFree(perThreadDataForWholeThreadGroup);
|
alignedFree(perThreadDataForWholeThreadGroup);
|
||||||
|
@ -405,7 +405,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||||
static_cast<uint16_t>(groupSizeY),
|
static_cast<uint16_t>(groupSizeY),
|
||||||
static_cast<uint16_t>(groupSizeZ)}},
|
static_cast<uint16_t>(groupSizeZ)}},
|
||||||
std::array<uint8_t, 3>{{0, 1, 2}},
|
std::array<uint8_t, 3>{{0, 1, 2}},
|
||||||
false, grfSize, gfxCoreHelper);
|
false, grfSize, rootDeviceEnvironment);
|
||||||
}
|
}
|
||||||
|
|
||||||
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
|
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
|
||||||
|
@ -902,8 +902,8 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties)
|
||||||
memset(pKernelProperties->uuid.mid, 0, ZE_MAX_MODULE_UUID_SIZE);
|
memset(pKernelProperties->uuid.mid, 0, ZE_MAX_MODULE_UUID_SIZE);
|
||||||
|
|
||||||
uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor));
|
uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor));
|
||||||
|
const auto &rootDeviceEnvironment = this->module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||||
maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, !kernelRequiresGenerationOfLocalIdsByRuntime, maxKernelWorkGroupSize);
|
maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, !kernelRequiresGenerationOfLocalIdsByRuntime, maxKernelWorkGroupSize, rootDeviceEnvironment);
|
||||||
pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize;
|
pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize;
|
||||||
|
|
||||||
void *pNext = pKernelProperties->pNext;
|
void *pNext = pKernelProperties->pNext;
|
||||||
|
|
|
@ -996,8 +996,9 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
|
||||||
result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
|
result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
|
||||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||||
|
|
||||||
const auto &gfxCoreHelper = device->getGfxCoreHelper();
|
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||||
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
|
||||||
|
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||||
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
||||||
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
|
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
|
||||||
EXPECT_EQ(indirectHeap->getUsed(), alignUp(sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize, gfxCoreHelper.getIOHAlignment()));
|
EXPECT_EQ(indirectHeap->getUsed(), alignUp(sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize, gfxCoreHelper.getIOHAlignment()));
|
||||||
|
@ -1029,11 +1030,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
||||||
|
|
||||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||||
const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
|
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
|
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||||
|
|
||||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||||
alignedFree(expectedLocalIds);
|
alignedFree(expectedLocalIds);
|
||||||
|
@ -1075,11 +1076,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
||||||
|
|
||||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||||
const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
|
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
|
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||||
|
|
||||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||||
alignedFree(expectedLocalIds);
|
alignedFree(expectedLocalIds);
|
||||||
|
|
|
@ -732,8 +732,8 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe
|
||||||
template <typename FamilyType>
|
template <typename FamilyType>
|
||||||
uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) {
|
uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) {
|
||||||
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
|
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
|
||||||
const auto &gfxCoreHelper = device->getGfxCoreHelper();
|
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||||
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||||
return implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
return implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||||
} else {
|
} else {
|
||||||
return 0u;
|
return 0u;
|
||||||
|
|
|
@ -304,7 +304,9 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
|
||||||
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = true; // although it is enabled for SIMD 1, make sure it is enforced
|
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = true; // although it is enabled for SIMD 1, make sure it is enforced
|
||||||
mockKernel.descriptor.kernelAttributes.numLocalIdChannels = 3;
|
mockKernel.descriptor.kernelAttributes.numLocalIdChannels = 3;
|
||||||
mockKernel.module = &mockModule;
|
mockKernel.module = &mockModule;
|
||||||
auto grfSize = mockModule.getDevice()->getHwInfo().capabilityTable.grfSize;
|
const auto &device = mockModule.getDevice();
|
||||||
|
auto grfSize = device->getHwInfo().capabilityTable.grfSize;
|
||||||
|
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||||
uint32_t groupSize[3] = {2, 3, 5};
|
uint32_t groupSize[3] = {2, 3, 5};
|
||||||
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
|
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
|
||||||
EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
|
EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
|
||||||
|
@ -314,7 +316,8 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
|
||||||
mockKernel.descriptor.kernelAttributes.simdSize,
|
mockKernel.descriptor.kernelAttributes.simdSize,
|
||||||
groupSize[0] * groupSize[1] * groupSize[2],
|
groupSize[0] * groupSize[1] * groupSize[2],
|
||||||
grfSize,
|
grfSize,
|
||||||
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime);
|
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
|
||||||
|
rootDeviceEnvironment);
|
||||||
auto perThreadDataSizeForWholeTGNeeded =
|
auto perThreadDataSizeForWholeTGNeeded =
|
||||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||||
mockKernel.descriptor.kernelAttributes.simdSize,
|
mockKernel.descriptor.kernelAttributes.simdSize,
|
||||||
|
@ -322,7 +325,7 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
|
||||||
mockKernel.descriptor.kernelAttributes.numLocalIdChannels,
|
mockKernel.descriptor.kernelAttributes.numLocalIdChannels,
|
||||||
groupSize[0] * groupSize[1] * groupSize[2],
|
groupSize[0] * groupSize[1] * groupSize[2],
|
||||||
!mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
|
!mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
|
||||||
gfxHelper));
|
rootDeviceEnvironment));
|
||||||
|
|
||||||
EXPECT_EQ(numThreadsPerTG, mockKernel.getNumThreadsPerThreadGroup());
|
EXPECT_EQ(numThreadsPerTG, mockKernel.getNumThreadsPerThreadGroup());
|
||||||
EXPECT_EQ((perThreadDataSizeForWholeTGNeeded / numThreadsPerTG), mockKernel.perThreadDataSize);
|
EXPECT_EQ((perThreadDataSizeForWholeTGNeeded / numThreadsPerTG), mockKernel.perThreadDataSize);
|
||||||
|
@ -1692,9 +1695,9 @@ TEST_F(KernelPropertiesTests, whenPassingKernelMaxGroupSizePropertiesStructToGet
|
||||||
|
|
||||||
ze_result_t res = kernel->getProperties(&kernelProperties);
|
ze_result_t res = kernel->getProperties(&kernelProperties);
|
||||||
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
|
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
|
||||||
|
auto &device = *module->getDevice();
|
||||||
auto &gfxCoreHelper = module->getDevice()->getGfxCoreHelper();
|
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||||
uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, false, static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor)));
|
uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, false, static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor)), device.getNEODevice()->getRootDeviceEnvironment());
|
||||||
EXPECT_EQ(maxKernelWorkGroupSize, maxGroupSizeProperties.maxGroupSize);
|
EXPECT_EQ(maxKernelWorkGroupSize, maxGroupSizeProperties.maxGroupSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,8 @@ template size_t NEO::HardwareCommandsHelper<NEO::FamilyType>::sendCrossThreadDat
|
||||||
bool inlineDataProgrammingRequired,
|
bool inlineDataProgrammingRequired,
|
||||||
FamilyType::DefaultWalkerType *walkerCmd,
|
FamilyType::DefaultWalkerType *walkerCmd,
|
||||||
uint32_t &sizeCrossThreadData,
|
uint32_t &sizeCrossThreadData,
|
||||||
uint64_t scratchAddress);
|
uint64_t scratchAddress,
|
||||||
|
const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||||
|
|
||||||
template size_t NEO::HardwareCommandsHelper<NEO::FamilyType>::sendInterfaceDescriptorData<NEO::FamilyType::DefaultWalkerType, NEO::FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
template size_t NEO::HardwareCommandsHelper<NEO::FamilyType>::sendInterfaceDescriptorData<NEO::FamilyType::DefaultWalkerType, NEO::FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||||
const IndirectHeap &indirectHeap,
|
const IndirectHeap &indirectHeap,
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2019-2023 Intel Corporation
|
* Copyright (C) 2019-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -72,7 +72,8 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
|
||||||
bool inlineDataProgrammingRequired,
|
bool inlineDataProgrammingRequired,
|
||||||
WalkerType *walkerCmd,
|
WalkerType *walkerCmd,
|
||||||
uint32_t &sizeCrossThreadData,
|
uint32_t &sizeCrossThreadData,
|
||||||
uint64_t scratchAddress);
|
uint64_t scratchAddress,
|
||||||
|
const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||||
|
|
||||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||||
static size_t sendIndirectState(
|
static size_t sendIndirectState(
|
||||||
|
@ -111,7 +112,8 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
|
||||||
const Kernel &kernel);
|
const Kernel &kernel);
|
||||||
static size_t getSizeRequiredIOH(
|
static size_t getSizeRequiredIOH(
|
||||||
const Kernel &kernel,
|
const Kernel &kernel,
|
||||||
const size_t localWorkSizes[3]);
|
const size_t localWorkSizes[3],
|
||||||
|
const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||||
static size_t getSizeRequiredSSH(
|
static size_t getSizeRequiredSSH(
|
||||||
const Kernel &kernel);
|
const Kernel &kernel);
|
||||||
|
|
||||||
|
|
|
@ -48,12 +48,11 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredDSH(const Kernel &kerne
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel,
|
size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel,
|
||||||
const size_t localWorkSizes[3]) {
|
const size_t localWorkSizes[3], const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
auto localWorkSize = Math::computeTotalElementsCount(localWorkSizes);
|
auto localWorkSize = Math::computeTotalElementsCount(localWorkSizes);
|
||||||
typedef typename GfxFamily::DefaultWalkerType DefaultWalkerType;
|
typedef typename GfxFamily::DefaultWalkerType DefaultWalkerType;
|
||||||
const auto &kernelDescriptor = kernel.getDescriptor();
|
const auto &kernelDescriptor = kernel.getDescriptor();
|
||||||
const auto &hwInfo = kernel.getHardwareInfo();
|
const auto &hwInfo = kernel.getHardwareInfo();
|
||||||
const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
|
|
||||||
|
|
||||||
auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||||
uint32_t grfSize = hwInfo.capabilityTable.grfSize;
|
uint32_t grfSize = hwInfo.capabilityTable.grfSize;
|
||||||
|
@ -70,11 +69,11 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kerne
|
||||||
requiredWalkOrder,
|
requiredWalkOrder,
|
||||||
simdSize);
|
simdSize);
|
||||||
auto size = kernel.getCrossThreadDataSize() +
|
auto size = kernel.getCrossThreadDataSize() +
|
||||||
getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize, isHwLocalIdGeneration, gfxCoreHelper);
|
getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||||
|
|
||||||
auto pImplicitArgs = kernel.getImplicitArgs();
|
auto pImplicitArgs = kernel.getImplicitArgs();
|
||||||
if (pImplicitArgs) {
|
if (pImplicitArgs) {
|
||||||
size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper);
|
size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||||
}
|
}
|
||||||
return alignUp(size, DefaultWalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
return alignUp(size, DefaultWalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||||
}
|
}
|
||||||
|
@ -110,7 +109,8 @@ size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(
|
||||||
const MultiDispatchInfo &multiDispatchInfo) {
|
const MultiDispatchInfo &multiDispatchInfo) {
|
||||||
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(
|
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(
|
||||||
*dispatchInfo.getKernel(),
|
*dispatchInfo.getKernel(),
|
||||||
dispatchInfo.getLocalWorkgroupSize().values); });
|
dispatchInfo.getLocalWorkgroupSize().values,
|
||||||
|
dispatchInfo.getClDevice().getRootDeviceEnvironment()); });
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
|
@ -270,14 +270,14 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||||
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
|
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
|
||||||
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
|
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
|
||||||
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, !localIdsGenerationByRuntime);
|
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, !localIdsGenerationByRuntime, device.getRootDeviceEnvironment());
|
||||||
|
|
||||||
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||||
|
|
||||||
auto inlineDataProgrammingRequired = EncodeDispatchKernel<GfxFamily>::inlineDataProgrammingRequired(kernel.getKernelInfo().kernelDescriptor);
|
auto inlineDataProgrammingRequired = EncodeDispatchKernel<GfxFamily>::inlineDataProgrammingRequired(kernel.getKernelInfo().kernelDescriptor);
|
||||||
size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData<WalkerType>(
|
size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData<WalkerType>(
|
||||||
ioh, kernel, inlineDataProgrammingRequired,
|
ioh, kernel, inlineDataProgrammingRequired,
|
||||||
walkerCmd, sizeCrossThreadData, scratchAddress);
|
walkerCmd, sizeCrossThreadData, scratchAddress, device.getRootDeviceEnvironment());
|
||||||
|
|
||||||
size_t sizePerThreadDataTotal = 0;
|
size_t sizePerThreadDataTotal = 0;
|
||||||
size_t sizePerThreadData = 0;
|
size_t sizePerThreadData = 0;
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2020-2023 Intel Corporation
|
* Copyright (C) 2020-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -78,20 +78,21 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||||
bool inlineDataProgrammingRequired,
|
bool inlineDataProgrammingRequired,
|
||||||
WalkerType *walkerCmd,
|
WalkerType *walkerCmd,
|
||||||
uint32_t &sizeCrossThreadData,
|
uint32_t &sizeCrossThreadData,
|
||||||
uint64_t scratchAddress) {
|
uint64_t scratchAddress,
|
||||||
|
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
indirectHeap.align(WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
indirectHeap.align(WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||||
|
|
||||||
auto pImplicitArgs = kernel.getImplicitArgs();
|
auto pImplicitArgs = kernel.getImplicitArgs();
|
||||||
if (pImplicitArgs) {
|
if (pImplicitArgs) {
|
||||||
const auto &kernelDescriptor = kernel.getDescriptor();
|
const auto &kernelDescriptor = kernel.getDescriptor();
|
||||||
const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
|
|
||||||
auto isHwLocalIdGeneration = false;
|
auto isHwLocalIdGeneration = false;
|
||||||
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper);
|
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||||
|
|
||||||
auto implicitArgsGpuVA = indirectHeap.getGraphicsAllocation()->getGpuAddress() + indirectHeap.getUsed();
|
auto implicitArgsGpuVA = indirectHeap.getGraphicsAllocation()->getGpuAddress() + indirectHeap.getUsed();
|
||||||
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
|
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
|
||||||
|
|
||||||
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper);
|
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}, rootDeviceEnvironment);
|
||||||
|
|
||||||
auto implicitArgsCrossThreadPtr = ptrOffset(reinterpret_cast<uint64_t *>(kernel.getCrossThreadData()), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
auto implicitArgsCrossThreadPtr = ptrOffset(reinterpret_cast<uint64_t *>(kernel.getCrossThreadData()), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||||
*implicitArgsCrossThreadPtr = implicitArgsGpuVA;
|
*implicitArgsCrossThreadPtr = implicitArgsGpuVA;
|
||||||
|
|
|
@ -59,7 +59,8 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||||
bool inlineDataProgrammingRequired,
|
bool inlineDataProgrammingRequired,
|
||||||
WalkerType *walkerCmd,
|
WalkerType *walkerCmd,
|
||||||
uint32_t &sizeCrossThreadData,
|
uint32_t &sizeCrossThreadData,
|
||||||
[[maybe_unused]] uint64_t scratchAddress) {
|
[[maybe_unused]] uint64_t scratchAddress,
|
||||||
|
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
|
|
||||||
indirectHeap.align(GfxFamily::indirectDataAlignment);
|
indirectHeap.align(GfxFamily::indirectDataAlignment);
|
||||||
|
|
||||||
|
@ -87,15 +88,14 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||||
requiredWalkOrder,
|
requiredWalkOrder,
|
||||||
kernelDescriptor.kernelAttributes.simdSize);
|
kernelDescriptor.kernelAttributes.simdSize);
|
||||||
|
|
||||||
const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
|
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !generationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||||
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !generationOfLocalIdsByRuntime, gfxCoreHelper);
|
|
||||||
|
|
||||||
auto sizeForLocalIdsProgramming = sizeForImplicitArgsProgramming - ImplicitArgs::getSize();
|
auto sizeForLocalIdsProgramming = sizeForImplicitArgsProgramming - ImplicitArgs::getSize();
|
||||||
offsetCrossThreadData += sizeForLocalIdsProgramming;
|
offsetCrossThreadData += sizeForLocalIdsProgramming;
|
||||||
|
|
||||||
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
|
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
|
||||||
|
|
||||||
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), gfxCoreHelper);
|
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), rootDeviceEnvironment);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t sizeToCopy = sizeCrossThreadData;
|
uint32_t sizeToCopy = sizeCrossThreadData;
|
||||||
|
|
|
@ -2201,7 +2201,7 @@ void Kernel::reconfigureKernel() {
|
||||||
bool isLocalIdsGeneratedByHw = false; // if local ids generated by runtime then more work groups available
|
bool isLocalIdsGeneratedByHw = false; // if local ids generated by runtime then more work groups available
|
||||||
maxWorkGroupSize = static_cast<uint32_t>(kernelInfo.getMaxRequiredWorkGroupSize(maxWorkGroupSize));
|
maxWorkGroupSize = static_cast<uint32_t>(kernelInfo.getMaxRequiredWorkGroupSize(maxWorkGroupSize));
|
||||||
|
|
||||||
this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, isLocalIdsGeneratedByHw, maxWorkGroupSize);
|
this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, isLocalIdsGeneratedByHw, maxWorkGroupSize, getDevice().getRootDeviceEnvironment());
|
||||||
|
|
||||||
this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites;
|
this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites;
|
||||||
this->systolicPipelineSelectMode = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode;
|
this->systolicPipelineSelectMode = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode;
|
||||||
|
@ -2287,13 +2287,12 @@ void Kernel::initializeLocalIdsCache() {
|
||||||
|
|
||||||
void Kernel::setLocalIdsForGroup(const Vec3<uint16_t> &groupSize, void *destination) const {
|
void Kernel::setLocalIdsForGroup(const Vec3<uint16_t> &groupSize, void *destination) const {
|
||||||
UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
|
UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
|
||||||
const auto &gfxCoreHelper = this->getGfxCoreHelper();
|
localIdsCache->setLocalIdsForGroup(groupSize, destination, clDevice.getRootDeviceEnvironment());
|
||||||
localIdsCache->setLocalIdsForGroup(groupSize, destination, gfxCoreHelper);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Kernel::getLocalIdsSizeForGroup(const Vec3<uint16_t> &groupSize) const {
|
size_t Kernel::getLocalIdsSizeForGroup(const Vec3<uint16_t> &groupSize) const {
|
||||||
UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
|
UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
|
||||||
return localIdsCache->getLocalIdsSizeForGroup(groupSize, getGfxCoreHelper());
|
return localIdsCache->getLocalIdsSizeForGroup(groupSize, clDevice.getRootDeviceEnvironment());
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Kernel::getLocalIdsSizePerThread() const {
|
size_t Kernel::getLocalIdsSizePerThread() const {
|
||||||
|
|
|
@ -742,7 +742,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH
|
||||||
walkerArgs);
|
walkerArgs);
|
||||||
|
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(kernel);
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernel, workGroupSize);
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernel, workGroupSize, pClDevice->getRootDeviceEnvironment());
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel);
|
||||||
|
|
||||||
EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
|
EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
|
||||||
|
@ -1354,8 +1354,9 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
||||||
multiDispatchInfoWithoutImplicitArgs,
|
multiDispatchInfoWithoutImplicitArgs,
|
||||||
CsrDependencies(),
|
CsrDependencies(),
|
||||||
walkerArgsWithoutImplicitArgs);
|
walkerArgsWithoutImplicitArgs);
|
||||||
|
const auto &rootDeviceEnvironment = pClDevice->getRootDeviceEnvironment();
|
||||||
|
|
||||||
auto iohSizeWithoutImplicitArgs = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithoutImplicitArgs, workGroupSize);
|
auto iohSizeWithoutImplicitArgs = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithoutImplicitArgs, workGroupSize, rootDeviceEnvironment);
|
||||||
|
|
||||||
DispatchInfo dispatchInfoWithImplicitArgs(pClDevice, const_cast<MockKernel *>(&kernelWithImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets);
|
DispatchInfo dispatchInfoWithImplicitArgs(pClDevice, const_cast<MockKernel *>(&kernelWithImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets);
|
||||||
dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1});
|
dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1});
|
||||||
|
@ -1370,7 +1371,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
||||||
CsrDependencies(),
|
CsrDependencies(),
|
||||||
walkerArgsWithImplicitArgs);
|
walkerArgsWithImplicitArgs);
|
||||||
|
|
||||||
auto iohSizeWithImplicitArgs = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize);
|
auto iohSizeWithImplicitArgs = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize, rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_LE(iohSizeWithoutImplicitArgs, iohSizeWithImplicitArgs);
|
EXPECT_LE(iohSizeWithoutImplicitArgs, iohSizeWithImplicitArgs);
|
||||||
|
|
||||||
|
@ -1378,10 +1379,10 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
||||||
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||||
auto simdSize = kernelInfo.getMaxSimdSize();
|
auto simdSize = kernelInfo.getMaxSimdSize();
|
||||||
uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
||||||
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
|
|
||||||
auto size = kernelWithImplicitArgs.getCrossThreadDataSize() +
|
auto size = kernelWithImplicitArgs.getCrossThreadDataSize() +
|
||||||
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(workGroupSize), false, gfxCoreHelper) +
|
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(workGroupSize), false, rootDeviceEnvironment) +
|
||||||
ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), false, gfxCoreHelper);
|
ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), false, rootDeviceEnvironment);
|
||||||
|
|
||||||
size = alignUp(size, MemoryConstants::cacheLineSize);
|
size = alignUp(size, MemoryConstants::cacheLineSize);
|
||||||
EXPECT_EQ(size, iohSizeWithImplicitArgs);
|
EXPECT_EQ(size, iohSizeWithImplicitArgs);
|
||||||
|
@ -1404,14 +1405,14 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsAndLocalWorkSizeIsSet
|
||||||
dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1});
|
dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1});
|
||||||
dispatchInfoWithImplicitArgs.setTotalNumberOfWorkgroups({1, 1, 1});
|
dispatchInfoWithImplicitArgs.setTotalNumberOfWorkgroups({1, 1, 1});
|
||||||
|
|
||||||
auto iohSizeWithImplicitArgsWithoutLWS = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize);
|
auto iohSizeWithImplicitArgsWithoutLWS = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize, pClDevice->getRootDeviceEnvironment());
|
||||||
|
|
||||||
dispatchInfoWithImplicitArgs.setLWS({683, 1, 1});
|
dispatchInfoWithImplicitArgs.setLWS({683, 1, 1});
|
||||||
|
|
||||||
auto lws = dispatchInfoWithImplicitArgs.getLocalWorkgroupSize();
|
auto lws = dispatchInfoWithImplicitArgs.getLocalWorkgroupSize();
|
||||||
kernelWithImplicitArgs.setLocalWorkSizeValues(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
|
kernelWithImplicitArgs.setLocalWorkSizeValues(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
|
||||||
|
|
||||||
auto iohSizeWithImplicitArgsWithLWS = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize);
|
auto iohSizeWithImplicitArgsWithLWS = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize, pClDevice->getRootDeviceEnvironment());
|
||||||
|
|
||||||
EXPECT_LE(iohSizeWithImplicitArgsWithoutLWS, iohSizeWithImplicitArgsWithLWS);
|
EXPECT_LE(iohSizeWithImplicitArgsWithoutLWS, iohSizeWithImplicitArgsWithLWS);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -500,7 +500,7 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenHelloWorldKernelWhenEnqueingKernelThenH
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
|
||||||
size_t localWorkSizes[] = {64, 1, 1};
|
size_t localWorkSizes[] = {64, 1, 1};
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes);
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes, pClDevice->getRootDeviceEnvironment());
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
|
@ -540,7 +540,7 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenKernelWithSimpleArgWhenEnqueingKernelTh
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
|
||||||
size_t localWorkSizes[] = {64, 1, 1};
|
size_t localWorkSizes[] = {64, 1, 1};
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes);
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes, pClDevice->getRootDeviceEnvironment());
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
|
||||||
|
|
||||||
EXPECT_EQ(0u, expectedSizeIOH % GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
EXPECT_EQ(0u, expectedSizeIOH % GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -98,7 +98,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingImageThenHeapsAndCommandBufferCons
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
size_t localWorkSizes[] = {256, 1, 1};
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment());
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
|
@ -146,7 +146,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingReadWriteImageThenHeapsAndCommandB
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel.get(), {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel.get(), {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel.get());
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel.get());
|
||||||
size_t localWorkSizes[] = {256, 1, 1};
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel.get(), localWorkSizes);
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel.get(), localWorkSizes, pDevice->getRootDeviceEnvironment());
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel.get());
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel.get());
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
|
@ -204,7 +204,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageNonBlockingThenHeapsAndComman
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
size_t localWorkSizes[] = {256, 1, 1};
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment());
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
|
@ -260,7 +260,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageBlockingThenHeapsAndCommandBu
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
size_t localWorkSizes[] = {256, 1, 1};
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment());
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
|
@ -316,7 +316,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageNonBlockingThenHeapsAndComman
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
size_t localWorkSizes[] = {256, 1, 1};
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment());
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
|
@ -372,7 +372,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageBlockingThenHeapsAndCommandBu
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
size_t localWorkSizes[] = {256, 1, 1};
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment());
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2019-2023 Intel Corporation
|
* Copyright (C) 2019-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -171,7 +171,9 @@ HWTEST_F(HardwareCommandsTest, WhenCrossThreadDataIsCreatedThenOnlyRequiredSpace
|
||||||
*kernel,
|
*kernel,
|
||||||
false,
|
false,
|
||||||
nullptr,
|
nullptr,
|
||||||
sizeCrossThreadData, 0);
|
sizeCrossThreadData,
|
||||||
|
0,
|
||||||
|
pClDevice->getRootDeviceEnvironment());
|
||||||
|
|
||||||
auto usedAfter = indirectHeap.getUsed();
|
auto usedAfter = indirectHeap.getUsed();
|
||||||
EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore);
|
EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore);
|
||||||
|
@ -199,7 +201,8 @@ HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComme
|
||||||
false,
|
false,
|
||||||
nullptr,
|
nullptr,
|
||||||
sizeCrossThreadData,
|
sizeCrossThreadData,
|
||||||
0);
|
0,
|
||||||
|
pClDevice->getRootDeviceEnvironment());
|
||||||
|
|
||||||
ASSERT_EQ(1u, kernel->getPatchInfoDataList().size());
|
ASSERT_EQ(1u, kernel->getPatchInfoDataList().size());
|
||||||
EXPECT_EQ(0xaaaaaaaa, kernel->getPatchInfoDataList()[0].sourceAllocation);
|
EXPECT_EQ(0xaaaaaaaa, kernel->getPatchInfoDataList()[0].sourceAllocation);
|
||||||
|
@ -222,7 +225,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapNotAllocatedF
|
||||||
false,
|
false,
|
||||||
nullptr,
|
nullptr,
|
||||||
sizeCrossThreadData,
|
sizeCrossThreadData,
|
||||||
0);
|
0,
|
||||||
|
pClDevice->getRootDeviceEnvironment());
|
||||||
EXPECT_EQ(0u, offset);
|
EXPECT_EQ(0u, offset);
|
||||||
pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
|
pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
|
||||||
}
|
}
|
||||||
|
@ -240,7 +244,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapAllocatedFrom
|
||||||
false,
|
false,
|
||||||
nullptr,
|
nullptr,
|
||||||
sizeCrossThreadData,
|
sizeCrossThreadData,
|
||||||
0);
|
0,
|
||||||
|
pClDevice->getRootDeviceEnvironment());
|
||||||
EXPECT_EQ(expectedOffset, offset);
|
EXPECT_EQ(expectedOffset, offset);
|
||||||
|
|
||||||
pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
|
pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
|
||||||
|
@ -275,7 +280,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenSendCrossThreadDataWhenWh
|
||||||
false,
|
false,
|
||||||
nullptr,
|
nullptr,
|
||||||
sizeCrossThreadData,
|
sizeCrossThreadData,
|
||||||
0);
|
0,
|
||||||
|
pClDevice->getRootDeviceEnvironment());
|
||||||
|
|
||||||
ASSERT_NE(0u, offsetCrossThreadData);
|
ASSERT_NE(0u, offsetCrossThreadData);
|
||||||
EXPECT_EQ(128u, offsetCrossThreadData);
|
EXPECT_EQ(128u, offsetCrossThreadData);
|
||||||
|
@ -373,7 +379,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
|
||||||
auto usedAfterIOH = ioh.getUsed();
|
auto usedAfterIOH = ioh.getUsed();
|
||||||
auto usedAfterSSH = ssh.getUsed();
|
auto usedAfterSSH = ssh.getUsed();
|
||||||
auto sizeRequiredDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto sizeRequiredDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
auto sizeRequiredIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
auto sizeRequiredIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes, pDevice->getRootDeviceEnvironment());
|
||||||
auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH);
|
EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH);
|
||||||
|
@ -559,8 +565,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
|
||||||
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
||||||
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
|
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
|
||||||
auto numChannels = modifiedKernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
auto numChannels = modifiedKernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||||
const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
|
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||||
size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize, !kernelUsesLocalIds, gfxCoreHelper);
|
size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize, !kernelUsesLocalIds, rootDeviceEnvironment);
|
||||||
ASSERT_LE(expectedIohSize, ioh.getUsed());
|
ASSERT_LE(expectedIohSize, ioh.getUsed());
|
||||||
|
|
||||||
auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
|
auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
|
||||||
|
@ -569,7 +575,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
|
||||||
std::array<uint8_t, 3>{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
|
std::array<uint8_t, 3>{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
|
||||||
modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
|
modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
|
||||||
modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}},
|
modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}},
|
||||||
false, grfSize, gfxCoreHelper);
|
false, grfSize, rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
|
EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
|
||||||
alignedFree(expectedLocalIds);
|
alignedFree(expectedLocalIds);
|
||||||
|
@ -1153,8 +1159,8 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
|
||||||
kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(expectedImplicitArgs.globalSizeX), static_cast<uint32_t>(expectedImplicitArgs.globalSizeY), static_cast<uint32_t>(expectedImplicitArgs.globalSizeZ));
|
kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(expectedImplicitArgs.globalSizeX), static_cast<uint32_t>(expectedImplicitArgs.globalSizeY), static_cast<uint32_t>(expectedImplicitArgs.globalSizeZ));
|
||||||
kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(expectedImplicitArgs.globalOffsetX), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetY), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetZ));
|
kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(expectedImplicitArgs.globalOffsetX), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetY), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetZ));
|
||||||
kernel.setNumWorkGroupsValues(expectedImplicitArgs.groupCountX, expectedImplicitArgs.groupCountY, expectedImplicitArgs.groupCountZ);
|
kernel.setNumWorkGroupsValues(expectedImplicitArgs.groupCountX, expectedImplicitArgs.groupCountY, expectedImplicitArgs.groupCountZ);
|
||||||
const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
|
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||||
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor(), false, gfxCoreHelper);
|
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor(), false, rootDeviceEnvironment);
|
||||||
|
|
||||||
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||||
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
|
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
|
||||||
|
@ -1164,7 +1170,8 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
|
||||||
false,
|
false,
|
||||||
nullptr,
|
nullptr,
|
||||||
sizeCrossThreadData,
|
sizeCrossThreadData,
|
||||||
0);
|
0,
|
||||||
|
pClDevice->getRootDeviceEnvironment());
|
||||||
|
|
||||||
EXPECT_LE(implicitArgsProgrammingSize, indirectHeap.getUsed());
|
EXPECT_LE(implicitArgsProgrammingSize, indirectHeap.getUsed());
|
||||||
|
|
||||||
|
@ -1218,11 +1225,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
||||||
|
|
||||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||||
const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
|
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
|
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||||
|
|
||||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, gfxCoreHelper);
|
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||||
alignedFree(expectedLocalIds);
|
alignedFree(expectedLocalIds);
|
||||||
|
@ -1252,11 +1259,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
||||||
|
|
||||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||||
const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
|
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
|
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||||
|
|
||||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, gfxCoreHelper);
|
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||||
alignedFree(expectedLocalIds);
|
alignedFree(expectedLocalIds);
|
||||||
|
@ -1308,7 +1315,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeap
|
||||||
false,
|
false,
|
||||||
nullptr,
|
nullptr,
|
||||||
sizeCrossThreadData,
|
sizeCrossThreadData,
|
||||||
0);
|
0,
|
||||||
|
pClDevice->getRootDeviceEnvironment());
|
||||||
EXPECT_EQ(expectedOffset, offset);
|
EXPECT_EQ(expectedOffset, offset);
|
||||||
pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
|
pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
|
||||||
}
|
}
|
||||||
|
@ -1326,7 +1334,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeap
|
||||||
false,
|
false,
|
||||||
nullptr,
|
nullptr,
|
||||||
sizeCrossThreadData,
|
sizeCrossThreadData,
|
||||||
0);
|
0,
|
||||||
|
pClDevice->getRootDeviceEnvironment());
|
||||||
EXPECT_EQ(expectedOffset, offset);
|
EXPECT_EQ(expectedOffset, offset);
|
||||||
|
|
||||||
pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
|
pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
|
||||||
|
@ -1362,7 +1371,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenSendCrossThr
|
||||||
false,
|
false,
|
||||||
nullptr,
|
nullptr,
|
||||||
sizeCrossThreadData,
|
sizeCrossThreadData,
|
||||||
0);
|
0,
|
||||||
|
pClDevice->getRootDeviceEnvironment());
|
||||||
|
|
||||||
auto expectedOffsetRelativeToIohBase = 128u;
|
auto expectedOffsetRelativeToIohBase = 128u;
|
||||||
auto iohBaseAddress = is64bit ? 0u : indirectHeap.getHeapGpuBase();
|
auto iohBaseAddress = is64bit ? 0u : indirectHeap.getHeapGpuBase();
|
||||||
|
|
|
@ -181,7 +181,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
|
|
||||||
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
||||||
bool isHwLocalIdGeneration = false;
|
bool isHwLocalIdGeneration = false;
|
||||||
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper);
|
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||||
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
||||||
uint64_t offsetThreadData = 0u;
|
uint64_t offsetThreadData = 0u;
|
||||||
{
|
{
|
||||||
|
@ -203,7 +203,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
auto implicitArgsCrossThreadPtr = ptrOffset(const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
auto implicitArgsCrossThreadPtr = ptrOffset(const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||||
*implicitArgsCrossThreadPtr = implicitArgsGpuVA;
|
*implicitArgsCrossThreadPtr = implicitArgsGpuVA;
|
||||||
|
|
||||||
ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper);
|
ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, rootDeviceEnvironment);
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy_s(ptr, sizeCrossThreadData,
|
memcpy_s(ptr, sizeCrossThreadData,
|
||||||
|
|
|
@ -240,7 +240,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
||||||
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, gfxCoreHelper);
|
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, rootDeviceEnvironment);
|
||||||
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
||||||
{
|
{
|
||||||
auto heap = container.getIndirectHeap(HeapType::indirectObject);
|
auto heap = container.getIndirectHeap(HeapType::indirectObject);
|
||||||
|
@ -254,11 +254,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
}
|
}
|
||||||
UNRECOVERABLE_IF(!ptr);
|
UNRECOVERABLE_IF(!ptr);
|
||||||
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData);
|
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData);
|
||||||
|
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
|
||||||
if (pImplicitArgs) {
|
if (pImplicitArgs) {
|
||||||
offsetThreadData -= ImplicitArgs::getSize();
|
offsetThreadData -= ImplicitArgs::getSize();
|
||||||
pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
|
pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
|
||||||
ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), gfxCoreHelper);
|
ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), rootDeviceEnvironment);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sizeCrossThreadData > 0) {
|
if (sizeCrossThreadData > 0) {
|
||||||
|
@ -313,7 +313,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
|
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
|
||||||
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false));
|
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
|
||||||
args.additionalCommands->push_back(commandBuffer);
|
args.additionalCommands->push_back(commandBuffer);
|
||||||
|
|
||||||
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
|
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
#include "shared/source/helpers/local_id_gen.h"
|
#include "shared/source/helpers/local_id_gen.h"
|
||||||
|
|
||||||
|
#include "shared/source/execution_environment/root_device_environment.h"
|
||||||
#include "shared/source/helpers/aligned_memory.h"
|
#include "shared/source/helpers/aligned_memory.h"
|
||||||
#include "shared/source/helpers/gfx_core_helper.h"
|
#include "shared/source/helpers/gfx_core_helper.h"
|
||||||
#include "shared/source/helpers/local_id_gen_special.inl"
|
#include "shared/source/helpers/local_id_gen_special.inl"
|
||||||
|
@ -42,9 +43,10 @@ LocalIDHelper::LocalIDHelper() {
|
||||||
|
|
||||||
LocalIDHelper LocalIDHelper::initializer;
|
LocalIDHelper LocalIDHelper::initializer;
|
||||||
|
|
||||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const GfxCoreHelper &gfxCoreHelper) {
|
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
bool localIdsGeneratedByHw = false;
|
bool localIdsGeneratedByHw = false;
|
||||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw));
|
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||||
|
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw, rootDeviceEnvironment));
|
||||||
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
||||||
if (useLayoutForImages) {
|
if (useLayoutForImages) {
|
||||||
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
||||||
|
|
|
@ -125,7 +125,7 @@ class GfxCoreHelper {
|
||||||
virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||||
virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
|
virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
|
||||||
const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const = 0;
|
const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const = 0;
|
||||||
virtual uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize) const = 0;
|
virtual uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||||
virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
|
virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
|
||||||
virtual size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const = 0;
|
virtual size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const = 0;
|
||||||
virtual bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const = 0;
|
virtual bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const = 0;
|
||||||
|
@ -174,7 +174,7 @@ class GfxCoreHelper {
|
||||||
virtual bool isChipsetUniqueUUIDSupported() const = 0;
|
virtual bool isChipsetUniqueUUIDSupported() const = 0;
|
||||||
virtual bool isTimestampShiftRequired() const = 0;
|
virtual bool isTimestampShiftRequired() const = 0;
|
||||||
virtual bool isRelaxedOrderingSupported() const = 0;
|
virtual bool isRelaxedOrderingSupported() const = 0;
|
||||||
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const = 0;
|
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||||
virtual uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const = 0;
|
virtual uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const = 0;
|
||||||
virtual char const *getDefaultDeviceHierarchy() const = 0;
|
virtual char const *getDefaultDeviceHierarchy() const = 0;
|
||||||
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
|
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
|
||||||
|
@ -341,7 +341,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||||
uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
|
uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
|
||||||
const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const override;
|
const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const override;
|
||||||
|
|
||||||
uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize) const override;
|
uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||||
size_t getMaxFillPaternSizeForCopyEngine() const override;
|
size_t getMaxFillPaternSizeForCopyEngine() const override;
|
||||||
|
|
||||||
size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const override;
|
size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const override;
|
||||||
|
@ -401,7 +401,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||||
bool isChipsetUniqueUUIDSupported() const override;
|
bool isChipsetUniqueUUIDSupported() const override;
|
||||||
bool isTimestampShiftRequired() const override;
|
bool isTimestampShiftRequired() const override;
|
||||||
bool isRelaxedOrderingSupported() const override;
|
bool isRelaxedOrderingSupported() const override;
|
||||||
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const override;
|
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||||
uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const override;
|
uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const override;
|
||||||
char const *getDefaultDeviceHierarchy() const override;
|
char const *getDefaultDeviceHierarchy() const override;
|
||||||
|
|
||||||
|
|
|
@ -707,7 +707,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::overrideMaxWorkGroupSize(uint32_t maxWG) co
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize) const {
|
uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||||
return defaultMaxGroupSize;
|
return defaultMaxGroupSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -717,7 +717,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const {
|
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||||
return getThreadsPerWG(simd, totalWorkItems);
|
return getThreadsPerWG(simd, totalWorkItems);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -14,7 +14,7 @@
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
|
||||||
namespace NEO {
|
namespace NEO {
|
||||||
class GfxCoreHelper;
|
struct RootDeviceEnvironment;
|
||||||
inline uint32_t getNumGrfsPerLocalIdCoordinate(uint32_t simd, uint32_t grfSize) {
|
inline uint32_t getNumGrfsPerLocalIdCoordinate(uint32_t simd, uint32_t grfSize) {
|
||||||
return (simd == 32 && grfSize == 32) ? 2 : 1;
|
return (simd == 32 && grfSize == 32) ? 2 : 1;
|
||||||
}
|
}
|
||||||
|
@ -64,7 +64,7 @@ void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroup
|
||||||
const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
|
const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
|
||||||
|
|
||||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
|
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
|
||||||
const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const NEO::GfxCoreHelper &gfxCoreHelper);
|
const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||||
void generateLocalIDsWithLayoutForImages(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);
|
void generateLocalIDsWithLayoutForImages(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);
|
||||||
|
|
||||||
bool isCompatibleWithLayoutForImages(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
|
bool isCompatibleWithLayoutForImages(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
#include "shared/source/execution_environment/root_device_environment.h"
|
||||||
#include "shared/source/helpers/gfx_core_helper.h"
|
#include "shared/source/helpers/gfx_core_helper.h"
|
||||||
#include "shared/source/helpers/local_id_gen.h"
|
#include "shared/source/helpers/local_id_gen.h"
|
||||||
#include "shared/source/helpers/simd_helper.h"
|
#include "shared/source/helpers/simd_helper.h"
|
||||||
|
@ -23,12 +24,13 @@ struct PerThreadDataHelper {
|
||||||
uint32_t numChannels,
|
uint32_t numChannels,
|
||||||
size_t localWorkSize,
|
size_t localWorkSize,
|
||||||
bool isHwLocalIdGeneration,
|
bool isHwLocalIdGeneration,
|
||||||
const GfxCoreHelper &gfxCoreHelper) {
|
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
auto perThreadSizeLocalIDs = static_cast<size_t>(getPerThreadSizeLocalIDs(simd, grfSize, numChannels));
|
auto perThreadSizeLocalIDs = static_cast<size_t>(getPerThreadSizeLocalIDs(simd, grfSize, numChannels));
|
||||||
if (isSimd1(simd)) {
|
if (isSimd1(simd)) {
|
||||||
return perThreadSizeLocalIDs * localWorkSize;
|
return perThreadSizeLocalIDs * localWorkSize;
|
||||||
}
|
}
|
||||||
return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfSize, isHwLocalIdGeneration);
|
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||||
|
return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfSize, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||||
}
|
}
|
||||||
}; // namespace PerThreadDataHelper
|
}; // namespace PerThreadDataHelper
|
||||||
} // namespace NEO
|
} // namespace NEO
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
#include "shared/source/helpers/local_id_gen.h"
|
#include "shared/source/helpers/local_id_gen.h"
|
||||||
|
|
||||||
|
#include "shared/source/execution_environment/root_device_environment.h"
|
||||||
#include "shared/source/helpers/aligned_memory.h"
|
#include "shared/source/helpers/aligned_memory.h"
|
||||||
#include "shared/source/helpers/gfx_core_helper.h"
|
#include "shared/source/helpers/gfx_core_helper.h"
|
||||||
#include "shared/source/helpers/local_id_gen_special.inl"
|
#include "shared/source/helpers/local_id_gen_special.inl"
|
||||||
|
@ -45,9 +46,10 @@ LocalIDHelper::LocalIDHelper() {
|
||||||
LocalIDHelper LocalIDHelper::initializer;
|
LocalIDHelper LocalIDHelper::initializer;
|
||||||
|
|
||||||
// traditional function to generate local IDs
|
// traditional function to generate local IDs
|
||||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const GfxCoreHelper &gfxCoreHelper) {
|
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
bool localIdsGeneratedByHw = false;
|
bool localIdsGeneratedByHw = false;
|
||||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw));
|
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||||
|
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw, rootDeviceEnvironment));
|
||||||
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
||||||
if (useLayoutForImages) {
|
if (useLayoutForImages) {
|
||||||
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2022-2023 Intel Corporation
|
* Copyright (C) 2022-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
#include "shared/source/kernel/implicit_args_helper.h"
|
#include "shared/source/kernel/implicit_args_helper.h"
|
||||||
|
|
||||||
|
#include "shared/source/execution_environment/root_device_environment.h"
|
||||||
#include "shared/source/helpers/aligned_memory.h"
|
#include "shared/source/helpers/aligned_memory.h"
|
||||||
#include "shared/source/helpers/basic_math.h"
|
#include "shared/source/helpers/basic_math.h"
|
||||||
#include "shared/source/helpers/hw_walk_order.h"
|
#include "shared/source/helpers/hw_walk_order.h"
|
||||||
|
@ -43,7 +44,7 @@ uint32_t getGrfSize(uint32_t simd) {
|
||||||
return 32u;
|
return 32u;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool isHwLocalIdGeneration, const GfxCoreHelper &gfxCoreHelper) {
|
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
if (!pImplicitArgs) {
|
if (!pImplicitArgs) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -58,15 +59,15 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const
|
||||||
auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize);
|
auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize);
|
||||||
uint32_t localIdsSizeNeeded =
|
uint32_t localIdsSizeNeeded =
|
||||||
alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||||
simdSize, grfSize, 3u, itemsInGroup, isHwLocalIdGeneration, gfxCoreHelper)),
|
simdSize, grfSize, 3u, itemsInGroup, isHwLocalIdGeneration, rootDeviceEnvironment)),
|
||||||
MemoryConstants::cacheLineSize);
|
MemoryConstants::cacheLineSize);
|
||||||
return implicitArgsSize + localIdsSizeNeeded;
|
return implicitArgsSize + localIdsSizeNeeded;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool, uint32_t>> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper) {
|
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool, uint32_t>> hwGenerationOfLocalIdsParams, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
auto localIdsGeneratedByHw = hwGenerationOfLocalIdsParams.has_value() ? hwGenerationOfLocalIdsParams.value().first : false;
|
auto localIdsGeneratedByHw = hwGenerationOfLocalIdsParams.has_value() ? hwGenerationOfLocalIdsParams.value().first : false;
|
||||||
auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, localIdsGeneratedByHw, gfxCoreHelper);
|
auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, localIdsGeneratedByHw, rootDeviceEnvironment);
|
||||||
auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram);
|
auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram);
|
||||||
|
|
||||||
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||||
|
@ -82,7 +83,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
|
||||||
static_cast<uint16_t>(implicitArgs.localSizeY),
|
static_cast<uint16_t>(implicitArgs.localSizeY),
|
||||||
static_cast<uint16_t>(implicitArgs.localSizeZ)}},
|
static_cast<uint16_t>(implicitArgs.localSizeZ)}},
|
||||||
dimensionOrder,
|
dimensionOrder,
|
||||||
false, grfSize, gfxCoreHelper);
|
false, grfSize, rootDeviceEnvironment);
|
||||||
auto sizeForLocalIdsProgramming = totalSizeToProgram - ImplicitArgs::getSize();
|
auto sizeForLocalIdsProgramming = totalSizeToProgram - ImplicitArgs::getSize();
|
||||||
ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
|
ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2021-2023 Intel Corporation
|
* Copyright (C) 2021-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -17,14 +17,14 @@
|
||||||
namespace NEO {
|
namespace NEO {
|
||||||
|
|
||||||
struct KernelDescriptor;
|
struct KernelDescriptor;
|
||||||
class GfxCoreHelper;
|
struct RootDeviceEnvironment;
|
||||||
|
|
||||||
inline constexpr const char *implicitArgsRelocationSymbolName = "__INTEL_PATCH_CROSS_THREAD_OFFSET_OFF_R0";
|
inline constexpr const char *implicitArgsRelocationSymbolName = "__INTEL_PATCH_CROSS_THREAD_OFFSET_OFF_R0";
|
||||||
|
|
||||||
namespace ImplicitArgsHelper {
|
namespace ImplicitArgsHelper {
|
||||||
std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams);
|
std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams);
|
||||||
uint32_t getGrfSize(uint32_t simd);
|
uint32_t getGrfSize(uint32_t simd);
|
||||||
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool localIdsGeneratedByRuntime, const GfxCoreHelper &gfxCoreHelper);
|
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool localIdsGeneratedByRuntime, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||||
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper);
|
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||||
} // namespace ImplicitArgsHelper
|
} // namespace ImplicitArgsHelper
|
||||||
} // namespace NEO
|
} // namespace NEO
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2022-2023 Intel Corporation
|
* Copyright (C) 2022-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
#include "shared/source/kernel/local_ids_cache.h"
|
#include "shared/source/kernel/local_ids_cache.h"
|
||||||
|
|
||||||
|
#include "shared/source/execution_environment/root_device_environment.h"
|
||||||
#include "shared/source/helpers/aligned_memory.h"
|
#include "shared/source/helpers/aligned_memory.h"
|
||||||
#include "shared/source/helpers/basic_math.h"
|
#include "shared/source/helpers/basic_math.h"
|
||||||
#include "shared/source/helpers/gfx_core_helper.h"
|
#include "shared/source/helpers/gfx_core_helper.h"
|
||||||
|
@ -34,12 +35,13 @@ std::unique_lock<std::mutex> LocalIdsCache::lock() {
|
||||||
return std::unique_lock<std::mutex>(setLocalIdsMutex);
|
return std::unique_lock<std::mutex>(setLocalIdsMutex);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3<uint16_t> &group, const GfxCoreHelper &gfxCoreHelper) const {
|
size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3<uint16_t> &group, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||||
const auto numElementsInGroup = static_cast<uint32_t>(Math::computeTotalElementsCount({group[0], group[1], group[2]}));
|
const auto numElementsInGroup = static_cast<uint32_t>(Math::computeTotalElementsCount({group[0], group[1], group[2]}));
|
||||||
if (isSimd1(simdSize)) {
|
if (isSimd1(simdSize)) {
|
||||||
return static_cast<size_t>(numElementsInGroup * localIdsSizePerThread);
|
return static_cast<size_t>(numElementsInGroup * localIdsSizePerThread);
|
||||||
}
|
}
|
||||||
const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfSize, false);
|
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||||
|
const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfSize, false, rootDeviceEnvironment);
|
||||||
return static_cast<size_t>(numberOfThreads * localIdsSizePerThread);
|
return static_cast<size_t>(numberOfThreads * localIdsSizePerThread);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,7 +54,7 @@ void LocalIdsCache::setLocalIdsForEntry(LocalIdsCacheEntry &entry, void *destina
|
||||||
std::memcpy(destination, entry.localIdsData, entry.localIdsSize);
|
std::memcpy(destination, entry.localIdsData, entry.localIdsSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
void LocalIdsCache::setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination, const GfxCoreHelper &gfxCoreHelper) {
|
void LocalIdsCache::setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
auto setLocalIdsLock = lock();
|
auto setLocalIdsLock = lock();
|
||||||
LocalIdsCacheEntry *leastAccessedEntry = &cache[0];
|
LocalIdsCacheEntry *leastAccessedEntry = &cache[0];
|
||||||
for (auto &cacheEntry : cache) {
|
for (auto &cacheEntry : cache) {
|
||||||
|
@ -65,12 +67,12 @@ void LocalIdsCache::setLocalIdsForGroup(const Vec3<uint16_t> &group, void *desti
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
commitNewEntry(*leastAccessedEntry, group, gfxCoreHelper);
|
commitNewEntry(*leastAccessedEntry, group, rootDeviceEnvironment);
|
||||||
setLocalIdsForEntry(*leastAccessedEntry, destination);
|
setLocalIdsForEntry(*leastAccessedEntry, destination);
|
||||||
}
|
}
|
||||||
|
|
||||||
void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_t> &group, const GfxCoreHelper &gfxCoreHelper) {
|
void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_t> &group, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||||
entry.localIdsSize = getLocalIdsSizeForGroup(group, gfxCoreHelper);
|
entry.localIdsSize = getLocalIdsSizeForGroup(group, rootDeviceEnvironment);
|
||||||
entry.groupSize = group;
|
entry.groupSize = group;
|
||||||
entry.accessCounter = 0U;
|
entry.accessCounter = 0U;
|
||||||
if (entry.localIdsSize > entry.localIdsSizeAllocated) {
|
if (entry.localIdsSize > entry.localIdsSizeAllocated) {
|
||||||
|
@ -79,7 +81,7 @@ void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_
|
||||||
entry.localIdsSizeAllocated = entry.localIdsSize;
|
entry.localIdsSizeAllocated = entry.localIdsSize;
|
||||||
}
|
}
|
||||||
NEO::generateLocalIDs(entry.localIdsData, static_cast<uint16_t>(simdSize),
|
NEO::generateLocalIDs(entry.localIdsData, static_cast<uint16_t>(simdSize),
|
||||||
{group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, gfxCoreHelper);
|
{group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, rootDeviceEnvironment);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace NEO
|
} // namespace NEO
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2022-2023 Intel Corporation
|
* Copyright (C) 2022-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -12,7 +12,7 @@
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
|
|
||||||
namespace NEO {
|
namespace NEO {
|
||||||
class GfxCoreHelper;
|
struct RootDeviceEnvironment;
|
||||||
class LocalIdsCache {
|
class LocalIdsCache {
|
||||||
public:
|
public:
|
||||||
struct LocalIdsCacheEntry {
|
struct LocalIdsCacheEntry {
|
||||||
|
@ -30,13 +30,13 @@ class LocalIdsCache {
|
||||||
LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages = false);
|
LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages = false);
|
||||||
~LocalIdsCache();
|
~LocalIdsCache();
|
||||||
|
|
||||||
void setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination, const GfxCoreHelper &gfxCoreHelper);
|
void setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||||
size_t getLocalIdsSizeForGroup(const Vec3<uint16_t> &group, const GfxCoreHelper &gfxCoreHelper) const;
|
size_t getLocalIdsSizeForGroup(const Vec3<uint16_t> &group, const RootDeviceEnvironment &rootDeviceEnvironment) const;
|
||||||
size_t getLocalIdsSizePerThread() const;
|
size_t getLocalIdsSizePerThread() const;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void setLocalIdsForEntry(LocalIdsCacheEntry &entry, void *destination);
|
void setLocalIdsForEntry(LocalIdsCacheEntry &entry, void *destination);
|
||||||
void commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_t> &group, const GfxCoreHelper &gfxCoreHelper);
|
void commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_t> &group, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||||
std::unique_lock<std::mutex> lock();
|
std::unique_lock<std::mutex> lock();
|
||||||
|
|
||||||
StackVec<LocalIdsCacheEntry, 4> cache;
|
StackVec<LocalIdsCacheEntry, 4> cache;
|
||||||
|
|
|
@ -1589,25 +1589,27 @@ HWTEST_F(GfxCoreHelperTest, GivenCooperativeEngineSupportedAndNotUsedWhenAdjustM
|
||||||
|
|
||||||
HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeThenAlwaysReturnDeviceDefault) {
|
HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeThenAlwaysReturnDeviceDefault) {
|
||||||
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
|
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
|
||||||
|
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||||
constexpr auto defaultMaxGroupSize = 1024u;
|
constexpr auto defaultMaxGroupSize = 1024u;
|
||||||
|
|
||||||
uint32_t simdSize = 16u;
|
uint32_t simdSize = 16u;
|
||||||
uint32_t isHwLocalIdGeneration = true;
|
uint32_t isHwLocalIdGeneration = true;
|
||||||
uint32_t numGrfRequired = GrfConfig::largeGrfNumber;
|
uint32_t numGrfRequired = GrfConfig::largeGrfNumber;
|
||||||
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize));
|
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment));
|
||||||
|
|
||||||
simdSize = 32u;
|
simdSize = 32u;
|
||||||
numGrfRequired = GrfConfig::largeGrfNumber;
|
numGrfRequired = GrfConfig::largeGrfNumber;
|
||||||
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize));
|
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment));
|
||||||
|
|
||||||
simdSize = 16u;
|
simdSize = 16u;
|
||||||
isHwLocalIdGeneration = false;
|
isHwLocalIdGeneration = false;
|
||||||
numGrfRequired = GrfConfig::defaultGrfNumber;
|
numGrfRequired = GrfConfig::defaultGrfNumber;
|
||||||
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize));
|
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment));
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) {
|
HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) {
|
||||||
auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
|
auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
|
||||||
|
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||||
std::array<std::array<uint32_t, 3>, 8> values = {{
|
std::array<std::array<uint32_t, 3>, 8> values = {{
|
||||||
{32u, 32u, 1u}, // SIMT Size, totalWorkItems, Max Num of threads
|
{32u, 32u, 1u}, // SIMT Size, totalWorkItems, Max Num of threads
|
||||||
{32u, 64u, 2u},
|
{32u, 64u, 2u},
|
||||||
|
@ -1620,7 +1622,7 @@ HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThe
|
||||||
}};
|
}};
|
||||||
|
|
||||||
for (auto &[simtSize, totalWgSize, expectedNumThreadsPerThreadGroup] : values) {
|
for (auto &[simtSize, totalWgSize, expectedNumThreadsPerThreadGroup] : values) {
|
||||||
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, true));
|
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, true, rootDeviceEnvironment));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1634,6 +1636,7 @@ HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuT
|
||||||
DebugManagerStateRestore dbgRestore;
|
DebugManagerStateRestore dbgRestore;
|
||||||
debugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.set(1);
|
debugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.set(1);
|
||||||
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
|
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
|
||||||
|
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||||
|
|
||||||
std::array<std::array<uint32_t, 5>, 8> values = {{
|
std::array<std::array<uint32_t, 5>, 8> values = {{
|
||||||
{32u, 32u, 128u, 1, 1u}, // SIMT Size, totalWorkItems, Max Num of threads, Grf size, Hw local id generation
|
{32u, 32u, 128u, 1, 1u}, // SIMT Size, totalWorkItems, Max Num of threads, Grf size, Hw local id generation
|
||||||
|
@ -1647,7 +1650,7 @@ HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuT
|
||||||
}};
|
}};
|
||||||
|
|
||||||
for (auto &[simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
|
for (auto &[simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
|
||||||
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration));
|
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, rootDeviceEnvironment));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -12,6 +12,7 @@
|
||||||
#include "shared/source/helpers/ptr_math.h"
|
#include "shared/source/helpers/ptr_math.h"
|
||||||
#include "shared/test/common/helpers/default_hw_info.h"
|
#include "shared/test/common/helpers/default_hw_info.h"
|
||||||
#include "shared/test/common/helpers/unit_test_helper.h"
|
#include "shared/test/common/helpers/unit_test_helper.h"
|
||||||
|
#include "shared/test/common/mocks/mock_execution_environment.h"
|
||||||
#include "shared/test/common/test_macros/hw_test.h"
|
#include "shared/test/common/test_macros/hw_test.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
@ -112,8 +113,9 @@ TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize
|
||||||
std::array<uint16_t, 3u> localSizes = {{2u, 2u, 1u}};
|
std::array<uint16_t, 3u> localSizes = {{2u, 2u, 1u}};
|
||||||
std::array<uint8_t, 3u> dimensionsOrder = {{0u, 1u, 2u}};
|
std::array<uint8_t, 3u> dimensionsOrder = {{0u, 1u, 2u}};
|
||||||
|
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, *gfxCoreHelper.get());
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, rootDeviceEnvironment);
|
||||||
EXPECT_EQ(localIdsView[0], 0u);
|
EXPECT_EQ(localIdsView[0], 0u);
|
||||||
EXPECT_EQ(localIdsView[1], 1u);
|
EXPECT_EQ(localIdsView[1], 1u);
|
||||||
EXPECT_EQ(localIdsView[2], 0u);
|
EXPECT_EQ(localIdsView[2], 0u);
|
||||||
|
@ -308,42 +310,47 @@ struct LocalIDFixture : ::testing::TestWithParam<std::tuple<int, int, int, int,
|
||||||
};
|
};
|
||||||
|
|
||||||
HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenIdsAreWithinLimits) {
|
HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenIdsAreWithinLimits) {
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||||
std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, *gfxCoreHelper.get());
|
std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment);
|
||||||
validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenAllWorkItemsCovered) {
|
HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenAllWorkItemsCovered) {
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||||
std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, *gfxCoreHelper.get());
|
std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment);
|
||||||
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
|
HWTEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
|
||||||
auto dimensionsOrder = std::array<uint8_t, 3>{{0, 1, 2}};
|
auto dimensionsOrder = std::array<uint8_t, 3>{{0, 1, 2}};
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||||
dimensionsOrder, false, grfSize, *gfxCoreHelper.get());
|
dimensionsOrder, false, grfSize, rootDeviceEnvironment);
|
||||||
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||||
validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
|
validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
|
HWTEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
|
||||||
auto dimensionsOrder = std::array<uint8_t, 3>{{1, 0, 2}};
|
auto dimensionsOrder = std::array<uint8_t, 3>{{1, 0, 2}};
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||||
dimensionsOrder, false, grfSize, *gfxCoreHelper.get());
|
dimensionsOrder, false, grfSize, rootDeviceEnvironment);
|
||||||
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||||
validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
|
validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
|
HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
|
||||||
auto dimensionsOrder = std::array<uint8_t, 3>{{2, 1, 0}};
|
auto dimensionsOrder = std::array<uint8_t, 3>{{2, 1, 0}};
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||||
dimensionsOrder, false, grfSize, *gfxCoreHelper.get());
|
dimensionsOrder, false, grfSize, rootDeviceEnvironment);
|
||||||
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
|
||||||
validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
|
validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
|
||||||
}
|
}
|
||||||
|
@ -383,8 +390,9 @@ struct LocalIdsLayoutForImagesTest : ::testing::TestWithParam<std::tuple<uint16_
|
||||||
memset(memory.get(), 0xff, size);
|
memset(memory.get(), 0xff, size);
|
||||||
buffer = reinterpret_cast<uint16_t *>(memory.get());
|
buffer = reinterpret_cast<uint16_t *>(memory.get());
|
||||||
EXPECT_TRUE(isCompatibleWithLayoutForImages(localWorkSize, dimensionsOrder, simd));
|
EXPECT_TRUE(isCompatibleWithLayoutForImages(localWorkSize, dimensionsOrder, simd));
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, *gfxCoreHelper.get());
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment);
|
||||||
}
|
}
|
||||||
void validateGRF() {
|
void validateGRF() {
|
||||||
uint32_t totalLocalIds = localWorkSize.at(0) * localWorkSize.at(1);
|
uint32_t totalLocalIds = localWorkSize.at(0) * localWorkSize.at(1);
|
||||||
|
@ -484,9 +492,10 @@ TEST_P(LocalIdsLayoutTest, givenLocalWorkgroupSize4x4x1WhenGenerateLocalIdsThenH
|
||||||
auto alignedMemory2 = allocateAlignedMemory(size, 32);
|
auto alignedMemory2 = allocateAlignedMemory(size, 32);
|
||||||
auto buffer2 = reinterpret_cast<uint16_t *>(alignedMemory2.get());
|
auto buffer2 = reinterpret_cast<uint16_t *>(alignedMemory2.get());
|
||||||
memset(buffer2, 0xff, size);
|
memset(buffer2, 0xff, size);
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, *gfxCoreHelper.get());
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, *gfxCoreHelper.get());
|
generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, rootDeviceEnvironment);
|
||||||
|
generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment);
|
||||||
|
|
||||||
for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) {
|
for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) {
|
||||||
for (auto j = 0u; j < rowWidth; j++) {
|
for (auto j = 0u; j < rowWidth; j++) {
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2022-2023 Intel Corporation
|
* Copyright (C) 2022-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -13,6 +13,7 @@
|
||||||
#include "shared/source/kernel/implicit_args_helper.h"
|
#include "shared/source/kernel/implicit_args_helper.h"
|
||||||
#include "shared/source/kernel/kernel_descriptor.h"
|
#include "shared/source/kernel/kernel_descriptor.h"
|
||||||
#include "shared/test/common/helpers/default_hw_info.h"
|
#include "shared/test/common/helpers/default_hw_info.h"
|
||||||
|
#include "shared/test/common/mocks/mock_execution_environment.h"
|
||||||
#include "shared/test/common/test_macros/hw_test.h"
|
#include "shared/test/common/test_macros/hw_test.h"
|
||||||
|
|
||||||
using namespace NEO;
|
using namespace NEO;
|
||||||
|
@ -57,8 +58,9 @@ TEST(ImplicitArgsHelperTest, givenSimdGreaterThanOneWhenGettingGrfSizeThenGrfSiz
|
||||||
TEST(ImplicitArgsHelperTest, givenNoImplicitArgsWhenGettingSizeForImplicitArgsProgrammingThenZeroIsReturned) {
|
TEST(ImplicitArgsHelperTest, givenNoImplicitArgsWhenGettingSizeForImplicitArgsProgrammingThenZeroIsReturned) {
|
||||||
|
|
||||||
KernelDescriptor kernelDescriptor{};
|
KernelDescriptor kernelDescriptor{};
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
EXPECT_EQ(0u, ImplicitArgsHelper::getSizeForImplicitArgsPatching(nullptr, kernelDescriptor, false, *gfxCoreHelper.get()));
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
EXPECT_EQ(0u, ImplicitArgsHelper::getSizeForImplicitArgsPatching(nullptr, kernelDescriptor, false, rootDeviceEnvironment));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
|
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
|
||||||
|
@ -75,9 +77,10 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
||||||
|
|
||||||
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
|
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
|
||||||
|
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize, false, *gfxCoreHelper.get()), MemoryConstants::cacheLineSize);
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
EXPECT_EQ(localIdsSize + ImplicitArgs::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get()));
|
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
|
||||||
|
EXPECT_EQ(localIdsSize + ImplicitArgs::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
|
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
|
||||||
|
@ -91,8 +94,9 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
||||||
implicitArgs.localSizeX = 2;
|
implicitArgs.localSizeX = 2;
|
||||||
implicitArgs.localSizeY = 3;
|
implicitArgs.localSizeY = 3;
|
||||||
implicitArgs.localSizeZ = 4;
|
implicitArgs.localSizeZ = 4;
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
EXPECT_EQ(alignUp(implicitArgs.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get()));
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
EXPECT_EQ(alignUp(implicitArgs.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
|
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
|
||||||
|
@ -109,8 +113,9 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
||||||
implicitArgs.localSizeX = 2;
|
implicitArgs.localSizeX = 2;
|
||||||
implicitArgs.localSizeY = 3;
|
implicitArgs.localSizeY = 3;
|
||||||
implicitArgs.localSizeZ = 4;
|
implicitArgs.localSizeZ = 4;
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get());
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment);
|
||||||
|
|
||||||
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
|
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
|
||||||
auto localIdsPatchingSize = totalWorkgroupSize * 3 * sizeof(uint16_t);
|
auto localIdsPatchingSize = totalWorkgroupSize * 3 * sizeof(uint16_t);
|
||||||
|
@ -121,7 +126,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
||||||
|
|
||||||
memset(memoryToPatch.get(), pattern, totalSizeForPatching);
|
memset(memoryToPatch.get(), pattern, totalSizeForPatching);
|
||||||
|
|
||||||
auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get());
|
auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));
|
EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));
|
||||||
|
|
||||||
|
@ -151,8 +156,9 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
||||||
implicitArgs.localSizeX = 2;
|
implicitArgs.localSizeX = 2;
|
||||||
implicitArgs.localSizeY = 3;
|
implicitArgs.localSizeY = 3;
|
||||||
implicitArgs.localSizeZ = 4;
|
implicitArgs.localSizeZ = 4;
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get());
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_EQ(alignUp(ImplicitArgs::getSize(), MemoryConstants::cacheLineSize), totalSizeForPatching);
|
EXPECT_EQ(alignUp(ImplicitArgs::getSize(), MemoryConstants::cacheLineSize), totalSizeForPatching);
|
||||||
|
|
||||||
|
@ -162,7 +168,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
||||||
|
|
||||||
memset(memoryToPatch.get(), pattern, totalSizeForPatching);
|
memset(memoryToPatch.get(), pattern, totalSizeForPatching);
|
||||||
|
|
||||||
auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get());
|
auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));
|
EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2022-2023 Intel Corporation
|
* Copyright (C) 2022-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -12,6 +12,7 @@
|
||||||
#include "shared/source/helpers/per_thread_data.h"
|
#include "shared/source/helpers/per_thread_data.h"
|
||||||
#include "shared/source/kernel/local_ids_cache.h"
|
#include "shared/source/kernel/local_ids_cache.h"
|
||||||
#include "shared/test/common/helpers/default_hw_info.h"
|
#include "shared/test/common/helpers/default_hw_info.h"
|
||||||
|
#include "shared/test/common/mocks/mock_execution_environment.h"
|
||||||
#include "shared/test/common/mocks/mock_graphics_allocation.h"
|
#include "shared/test/common/mocks/mock_graphics_allocation.h"
|
||||||
#include "shared/test/common/test_macros/test.h"
|
#include "shared/test/common/test_macros/test.h"
|
||||||
|
|
||||||
|
@ -38,8 +39,9 @@ using LocalIdsCacheTests = Test<LocalIdsCacheFixture>;
|
||||||
TEST_F(LocalIdsCacheTests, GivenCacheMissWhenGetLocalIdsForGroupThenNewEntryIsCommitedIntoLeastUsedEntry) {
|
TEST_F(LocalIdsCacheTests, GivenCacheMissWhenGetLocalIdsForGroupThenNewEntryIsCommitedIntoLeastUsedEntry) {
|
||||||
localIdsCache->cache.resize(2);
|
localIdsCache->cache.resize(2);
|
||||||
localIdsCache->cache[0].accessCounter = 2U;
|
localIdsCache->cache[0].accessCounter = 2U;
|
||||||
auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get());
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), rootDeviceEnvironment);
|
||||||
|
|
||||||
EXPECT_EQ(groupSize, localIdsCache->cache[1].groupSize);
|
EXPECT_EQ(groupSize, localIdsCache->cache[1].groupSize);
|
||||||
EXPECT_NE(nullptr, localIdsCache->cache[1].localIdsData);
|
EXPECT_NE(nullptr, localIdsCache->cache[1].localIdsData);
|
||||||
|
@ -54,8 +56,9 @@ TEST_F(LocalIdsCacheTests, GivenEntryInCacheWhenGetLocalIdsForGroupThenEntryFrom
|
||||||
localIdsCache->cache[0].localIdsSize = 512U;
|
localIdsCache->cache[0].localIdsSize = 512U;
|
||||||
localIdsCache->cache[0].localIdsSizeAllocated = 512U;
|
localIdsCache->cache[0].localIdsSizeAllocated = 512U;
|
||||||
localIdsCache->cache[0].accessCounter = 1U;
|
localIdsCache->cache[0].accessCounter = 1U;
|
||||||
auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get());
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), rootDeviceEnvironment);
|
||||||
EXPECT_EQ(2U, localIdsCache->cache[0].accessCounter);
|
EXPECT_EQ(2U, localIdsCache->cache[0].accessCounter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -68,8 +71,9 @@ TEST_F(LocalIdsCacheTests, GivenEntryWithBiggerBufferAllocatedWhenGetLocalIdsFor
|
||||||
const auto localIdsData = localIdsCache->cache[0].localIdsData;
|
const auto localIdsData = localIdsCache->cache[0].localIdsData;
|
||||||
|
|
||||||
groupSize = {2, 1, 1};
|
groupSize = {2, 1, 1};
|
||||||
auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get());
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), rootDeviceEnvironment);
|
||||||
EXPECT_EQ(1U, localIdsCache->cache[0].accessCounter);
|
EXPECT_EQ(1U, localIdsCache->cache[0].accessCounter);
|
||||||
EXPECT_EQ(192U, localIdsCache->cache[0].localIdsSize);
|
EXPECT_EQ(192U, localIdsCache->cache[0].localIdsSize);
|
||||||
EXPECT_EQ(512U, localIdsCache->cache[0].localIdsSizeAllocated);
|
EXPECT_EQ(512U, localIdsCache->cache[0].localIdsSizeAllocated);
|
||||||
|
@ -82,16 +86,18 @@ TEST_F(LocalIdsCacheTests, GivenValidLocalIdsCacheWhenGettingLocalIdsSizePerThre
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LocalIdsCacheTests, GivenValidLocalIdsCacheWhenGettingLocalIdsSizeForGroupThenCorrectValueIsReturned) {
|
TEST_F(LocalIdsCacheTests, GivenValidLocalIdsCacheWhenGettingLocalIdsSizeForGroupThenCorrectValueIsReturned) {
|
||||||
auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, *gfxCoreHelper.get());
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
|
auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, rootDeviceEnvironment);
|
||||||
EXPECT_EQ(1536U, localIdsSizePerThread);
|
EXPECT_EQ(1536U, localIdsSizePerThread);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(LocalIdsCacheTest, givenSimd1WhenGettingLocalIdsSizeForGroupThenCorrectValueIsReturned) {
|
TEST(LocalIdsCacheTest, givenSimd1WhenGettingLocalIdsSizeForGroupThenCorrectValueIsReturned) {
|
||||||
auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily);
|
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||||
|
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||||
auto localIdsCache = std::make_unique<MockLocalIdsCache>(1u, 1u);
|
auto localIdsCache = std::make_unique<MockLocalIdsCache>(1u, 1u);
|
||||||
Vec3<uint16_t> groupSize = {128, 2, 1};
|
Vec3<uint16_t> groupSize = {128, 2, 1};
|
||||||
auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, *gfxCoreHelper.get());
|
auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, rootDeviceEnvironment);
|
||||||
auto expectedLocalIdsSizePerThread = groupSize[0] * groupSize[1] * groupSize[2] * localIdsCache->getLocalIdsSizePerThread();
|
auto expectedLocalIdsSizePerThread = groupSize[0] * groupSize[1] * groupSize[2] * localIdsCache->getLocalIdsSizePerThread();
|
||||||
EXPECT_EQ(expectedLocalIdsSizePerThread, localIdsSizePerThread);
|
EXPECT_EQ(expectedLocalIdsSizePerThread, localIdsSizePerThread);
|
||||||
}
|
}
|
Loading…
Reference in New Issue