Revert "fix: Unify logic calculating threads per work group part 2"
This reverts commit 1e8a53bd53.
Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
parent 2d7505c01c
commit 39740da9d1
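The hunks below undo the interface change introduced by the reverted commit: the trailing const GfxCoreHelper & parameter is dropped again from generateLocalIDs, ImplicitArgsHelper::patchImplicitArgs and LocalIdsCache::setLocalIdsForGroup/commitNewEntry, the thread count per work group is computed with getThreadsPerWG instead of GfxCoreHelper::calculateNumThreadsPerThreadGroup, and call sites go back to getDevice().getGfxCoreHelper() where a helper reference is still needed. Condensed before/after of the central declaration, taken from the hunks below (not a complete listing):

// state introduced by 1e8a53bd53 (removed by this revert)
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
                      const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const NEO::GfxCoreHelper &gfxCoreHelper);

// state restored by this revert
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
                      const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize);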
@@ -363,7 +363,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
 static_cast<uint16_t>(groupSizeY),
 static_cast<uint16_t>(groupSizeZ)}},
 std::array<uint8_t, 3>{{0, 1, 2}},
-false, grfSize, gfxCoreHelper);
+false, grfSize);
 }

 this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
@@ -1023,8 +1023,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv

 auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
 auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize);
-const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
-generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
+generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize);

 auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
 size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);

@@ -1069,8 +1068,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv

 auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
 auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize);
-const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
-generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
+generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize);

 auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
 size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
@@ -91,8 +91,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(

 auto implicitArgsGpuVA = indirectHeap.getGraphicsAllocation()->getGpuAddress() + indirectHeap.getUsed();
 auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
-const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
-ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper);
+ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {});

 auto implicitArgsCrossThreadPtr = ptrOffset(reinterpret_cast<uint64_t *>(kernel.getCrossThreadData()), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
 *implicitArgsCrossThreadPtr = implicitArgsGpuVA;
@@ -96,8 +96,8 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
 kernelAttributes.flags.requiresWorkgroupWalkOrder,
 requiredWalkOrder,
 kernelDescriptor.kernelAttributes.simdSize);
-const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
-ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), gfxCoreHelper);
+
+ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder));
 }

 using InlineData = typename GfxFamily::INLINE_DATA;
@@ -581,7 +581,7 @@ cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName,
 cl_ulong privateMemSize;
 size_t maxWorkgroupSize;
 const auto &hwInfo = clDevice.getHardwareInfo();
-auto &gfxCoreHelper = this->getGfxCoreHelper();
+auto &gfxCoreHelper = clDevice.getGfxCoreHelper();
 auto &clGfxCoreHelper = clDevice.getRootDeviceEnvironment().getHelper<ClGfxCoreHelper>();
 GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

@@ -1537,7 +1537,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
 disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(),
 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
 } else if (isValidOffset(argAsPtr.bindless)) {
-auto &gfxCoreHelper = this->getGfxCoreHelper();
+auto &gfxCoreHelper = getDevice().getGfxCoreHelper();
 auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
 auto surfaceState = ptrOffset(getSurfaceStateHeap(), surfaceStateSize * argIndex);

@@ -1655,7 +1655,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,

 void *surfaceState = nullptr;
 if (isValidOffset(argAsImg.bindless)) {
-auto &gfxCoreHelper = this->getGfxCoreHelper();
+auto &gfxCoreHelper = getDevice().getGfxCoreHelper();
 auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
 surfaceState = ptrOffset(getSurfaceStateHeap(), surfaceStateSize * argIndex);
 } else {
@@ -2130,7 +2130,7 @@ uint64_t Kernel::getKernelStartAddress(const bool localIdsGenerationByRuntime, c
 kernelStartOffset += getStartOffset();

 auto &hardwareInfo = getHardwareInfo();
-const auto &gfxCoreHelper = this->getGfxCoreHelper();
+const auto &gfxCoreHelper = getDevice().getGfxCoreHelper();
 const auto &productHelper = getDevice().getProductHelper();

 if (isCssUsed && gfxCoreHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo, productHelper)) {

@@ -2140,7 +2140,7 @@ uint64_t Kernel::getKernelStartAddress(const bool localIdsGenerationByRuntime, c
 return kernelStartOffset;
 }
 void *Kernel::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {
-auto &gfxCoreHelper = this->getGfxCoreHelper();
+auto &gfxCoreHelper = getDevice().getGfxCoreHelper();
 auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
 NEO::BindlessHeapsHelper *bindlessHeapsHelper = getDevice().getDevice().getBindlessHeapsHelper();
 auto ssInHeap = bindlessHeapsHelper->allocateSSInHeap(surfaceStateSize, alloc, NEO::BindlessHeapsHelper::GLOBAL_SSH);

@@ -2159,7 +2159,7 @@ uint32_t Kernel::getAdditionalKernelExecInfo() const {
 }

 bool Kernel::requiresWaDisableRccRhwoOptimization() const {
-auto &gfxCoreHelper = this->getGfxCoreHelper();
+auto &gfxCoreHelper = getDevice().getGfxCoreHelper();
 auto rootDeviceIndex = getDevice().getRootDeviceIndex();

 if (gfxCoreHelper.isWaDisableRccRhwoOptimizationRequired() && isUsingSharedObjArgs()) {

@@ -2270,7 +2270,7 @@ void Kernel::reconfigureKernel() {
 kernelDescriptor.kernelAttributes.simdSize != 32) {
 this->maxKernelWorkGroupSize >>= 1;
 }
-const auto &gfxCoreHelper = this->getGfxCoreHelper();
+const auto &gfxCoreHelper = getDevice().getGfxCoreHelper();
 bool isLocalIdsGeneratedByHw = false; // if local ids generated by runtime then more work groups available
 auto maxWorkGroupSize = static_cast<uint32_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize()));
@@ -2369,8 +2369,7 @@ void Kernel::initializeLocalIdsCache() {

 void Kernel::setLocalIdsForGroup(const Vec3<uint16_t> &groupSize, void *destination) const {
 UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
-const auto &gfxCoreHelper = this->getGfxCoreHelper();
-localIdsCache->setLocalIdsForGroup(groupSize, destination, gfxCoreHelper);
+localIdsCache->setLocalIdsForGroup(groupSize, destination);
 }

 size_t Kernel::getLocalIdsSizeForGroup(const Vec3<uint16_t> &groupSize) const {
@@ -410,10 +410,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
 size_t getLocalIdsSizeForGroup(const Vec3<uint16_t> &groupSize) const;
 size_t getLocalIdsSizePerThread() const;

-const GfxCoreHelper &getGfxCoreHelper() const {
-return getDevice().getGfxCoreHelper();
-}
-
 protected:
 struct KernelConfig {
 Vec3<size_t> gws;
@@ -552,13 +552,12 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
 ASSERT_LE(expectedIohSize, ioh.getUsed());

 auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
-const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
 generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(),
 std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
 std::array<uint8_t, 3>{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
 modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
 modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}},
-false, grfSize, gfxCoreHelper);
+false, grfSize);

 EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
 alignedFree(expectedLocalIds);
@@ -1378,8 +1377,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI

 auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
 auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize);
-const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
-generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
+generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize);

 auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
 size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);

@@ -1412,8 +1410,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI

 auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
 auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize);
-const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
-generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
+generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize);

 auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
 size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
@@ -183,7 +183,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
 auto implicitArgsCrossThreadPtr = ptrOffset(const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
 *implicitArgsCrossThreadPtr = implicitArgsGpuVA;

-ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper);
+ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {});
 }

 memcpy_s(ptr, sizeCrossThreadData,

@@ -218,7 +218,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
 if (pImplicitArgs) {
 offsetThreadData -= sizeof(ImplicitArgs);
 pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
-ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), gfxCoreHelper);
+ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder));
 }

 if (sizeCrossThreadData > 0) {
@@ -8,7 +8,6 @@
 #include "shared/source/helpers/local_id_gen.h"

 #include "shared/source/helpers/aligned_memory.h"
-#include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/local_id_gen_special.inl"
 #include "shared/source/utilities/cpu_info.h"

@@ -42,9 +41,8 @@ LocalIDHelper::LocalIDHelper() {

 LocalIDHelper LocalIDHelper::initializer;

-void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const GfxCoreHelper &gfxCoreHelper) {
-bool localIdsGeneratedByHw = false;
-auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw));
+void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
+auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
 bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
 if (useLayoutForImages) {
 generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
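For orientation, the restored body above computes the thread count with getThreadsPerWG rather than querying the GfxCoreHelper. The sketch below only illustrates that ceiling division, under the assumption that getThreadsPerWG(simd, lwsTotal) behaves like ceil(lwsTotal / simd); it is not the NEO implementation.

#include <cstdint>

// Illustrative sketch, not NEO code: number of HW threads needed to cover lwsTotal work items.
static uint32_t threadsPerWorkGroupSketch(uint32_t simd, uint32_t lwsTotal) {
    return (lwsTotal + simd - 1) / simd; // e.g. simd = 16, lwsTotal = 2 * 2 * 1 -> 1 thread
}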
@@ -12,7 +12,6 @@
 #include <cstdint>

 namespace NEO {
-class GfxCoreHelper;
 inline uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) {
 return (simd == 32 && grfSize == 32) ? 2 : 1;
 }
@@ -59,7 +58,7 @@ void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroup
 const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);

 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
-const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const NEO::GfxCoreHelper &gfxCoreHelper);
+const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize);
 void generateLocalIDsWithLayoutForImages(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);

 bool isCompatibleWithLayoutForImages(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
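With the declaration restored as above, a call site passes only the buffer, SIMD width, local work size, dimension order, image-layout flag and GRF size. A minimal sketch modeled on the local-ID unit tests further down in this diff; the wrapper function and the extra <array> include are illustrative:

#include <array>

#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/local_id_gen.h"

using namespace NEO;

// Buffer size and constants mirror the variadic-GRF unit test below: SIMD16, GRF size 64,
// a 2x2x1 work group, XYZ dimension order, not an image-only kernel.
void exampleGenerateLocalIdsAfterRevert() {
    auto localIdsPtr = allocateAlignedMemory(3 * 64u, MemoryConstants::cacheLineSize);
    std::array<uint16_t, 3u> localSizes = {{2u, 2u, 1u}};
    std::array<uint8_t, 3u> dimensionsOrder = {{0u, 1u, 2u}};
    generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u); // no GfxCoreHelper argument
}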
@@ -8,7 +8,6 @@
 #include "shared/source/helpers/local_id_gen.h"

 #include "shared/source/helpers/aligned_memory.h"
-#include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/local_id_gen_special.inl"
 #include "shared/source/utilities/cpu_info.h"

@@ -45,9 +44,8 @@ LocalIDHelper::LocalIDHelper() {
 LocalIDHelper LocalIDHelper::initializer;

 // traditional function to generate local IDs
-void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const GfxCoreHelper &gfxCoreHelper) {
-bool localIdsGeneratedByHw = false;
-auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw));
+void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
+auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
 bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
 if (useLayoutForImages) {
 generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
@@ -15,7 +15,6 @@
 namespace NEO {

 struct KernelDescriptor;
-class GfxCoreHelper;

 struct ImplicitArgs {
 uint8_t structSize;
@@ -52,6 +51,6 @@ namespace ImplicitArgsHelper {
 std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams);
 uint32_t getGrfSize(uint32_t simd);
 uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor);
-void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper);
+void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams);
 } // namespace ImplicitArgsHelper
 } // namespace NEO
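On the patching side the helper argument disappears as well. A sketch of a post-revert call, modeled on the ImplicitArgsHelper tests later in this diff; implicitArgs and kernelDescriptor are assumed to be populated by the caller, and the wrapper function is illustrative:

#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/kernel/implicit_args.h"
#include "shared/source/kernel/kernel_descriptor.h"

using namespace NEO;

// Sketch only: size the destination with getSizeForImplicitArgsPatching, then patch it
// without a GfxCoreHelper; {} means no parameters for HW generation of local ids.
void patchImplicitArgsSketch(const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor) {
    auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor);
    auto memoryToPatch = allocateAlignedMemory(totalSizeForPatching, MemoryConstants::cacheLineSize);
    auto *retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {});
    // retVal points just past the patched region: memoryToPatch.get() + totalSizeForPatching.
    (void)retVal;
}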
@@ -62,7 +62,7 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const
 }
 }

-void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool, uint32_t>> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper) {
+void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool, uint32_t>> hwGenerationOfLocalIdsParams) {

 auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor);
 auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram);

@@ -80,7 +80,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
 static_cast<uint16_t>(implicitArgs.localSizeY),
 static_cast<uint16_t>(implicitArgs.localSizeZ)}},
 dimensionOrder,
-false, grfSize, gfxCoreHelper);
+false, grfSize);
 auto sizeForLocalIdsProgramming = totalSizeToProgram - sizeof(NEO::ImplicitArgs);
 ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
 }
@@ -47,7 +47,7 @@ void LocalIdsCache::setLocalIdsForEntry(LocalIdsCacheEntry &entry, void *destina
 std::memcpy(destination, entry.localIdsData, entry.localIdsSize);
 }

-void LocalIdsCache::setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination, const GfxCoreHelper &gfxCoreHelper) {
+void LocalIdsCache::setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination) {
 auto setLocalIdsLock = lock();
 LocalIdsCacheEntry *leastAccessedEntry = &cache[0];
 for (auto &cacheEntry : cache) {

@@ -60,11 +60,11 @@ void LocalIdsCache::setLocalIdsForGroup(const Vec3<uint16_t> &group, void *desti
 }
 }

-commitNewEntry(*leastAccessedEntry, group, gfxCoreHelper);
+commitNewEntry(*leastAccessedEntry, group);
 setLocalIdsForEntry(*leastAccessedEntry, destination);
 }

-void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_t> &group, const GfxCoreHelper &gfxCoreHelper) {
+void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_t> &group) {
 entry.localIdsSize = getLocalIdsSizeForGroup(group);
 entry.groupSize = group;
 entry.accessCounter = 0U;

@@ -74,7 +74,7 @@ void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_
 entry.localIdsSizeAllocated = entry.localIdsSize;
 }
 NEO::generateLocalIDs(entry.localIdsData, static_cast<uint16_t>(simdSize),
-{group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, gfxCoreHelper);
+{group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize);
 }

 } // namespace NEO
@@ -12,7 +12,7 @@
 #include <mutex>

 namespace NEO {
-class GfxCoreHelper;
+
 class LocalIdsCache {
 public:
 struct LocalIdsCacheEntry {
@@ -30,13 +30,13 @@ class LocalIdsCache {
 LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages = false);
 ~LocalIdsCache();

-void setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination, const GfxCoreHelper &gfxCoreHelper);
+void setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination);
 size_t getLocalIdsSizeForGroup(const Vec3<uint16_t> &group) const;
 size_t getLocalIdsSizePerThread() const;

 protected:
 void setLocalIdsForEntry(LocalIdsCacheEntry &entry, void *destination);
-void commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_t> &group, const GfxCoreHelper &gfxCoreHelper);
+void commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_t> &group);
 std::unique_lock<std::mutex> lock();

 StackVec<LocalIdsCacheEntry, 4> cache;
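With the cache interface above back to two parameters, callers supply only the group size and a destination buffer sized via getLocalIdsSizeForGroup. A sketch modeled on the LocalIdsCache tests at the end of this diff; the cache object is assumed to be already constructed (the test fixture does that), and the group size is the one those tests use:

#include <cstdint>
#include <vector>

#include "shared/source/kernel/local_ids_cache.h"

// Sketch only: query the per-group size, then let the cache write the local ids into the buffer.
void fillLocalIdsSketch(NEO::LocalIdsCache &localIdsCache) {
    NEO::Vec3<uint16_t> groupSize = {2, 1, 1};
    std::vector<uint8_t> perThreadData(localIdsCache.getLocalIdsSizeForGroup(groupSize));
    localIdsCache.setLocalIdsForGroup(groupSize, perThreadData.data()); // no GfxCoreHelper argument
}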
@@ -7,10 +7,8 @@

 #include "shared/source/helpers/aligned_memory.h"
 #include "shared/source/helpers/basic_math.h"
-#include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/local_id_gen.h"
 #include "shared/source/helpers/ptr_math.h"
-#include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/helpers/unit_test_helper.h"
 #include "shared/test/common/test_macros/hw_test.h"

@@ -77,16 +75,14 @@ TEST(LocalID, GivenSimd1WhenGettingPerThreadSizeLocalIdsThenValueIsEqualGrfSize)

 EXPECT_EQ(grfSize, getPerThreadSizeLocalIDs(simd, grfSize));
 }

-TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize) {
+TEST(LocalID, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize) {
 auto localIdsPtr = allocateAlignedMemory(3 * 64u, MemoryConstants::cacheLineSize);

 uint16_t *localIdsView = reinterpret_cast<uint16_t *>(localIdsPtr.get());
 std::array<uint16_t, 3u> localSizes = {{2u, 2u, 1u}};
 std::array<uint8_t, 3u> dimensionsOrder = {{0u, 1u, 2u}};
-
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
-generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, *gfxCoreHelper.get());
+generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u);
 EXPECT_EQ(localIdsView[0], 0u);
 EXPECT_EQ(localIdsView[1], 1u);
 EXPECT_EQ(localIdsView[2], 0u);
@@ -281,42 +277,37 @@ struct LocalIDFixture : ::testing::TestWithParam<std::tuple<int, int, int, int,
 };

 HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenIdsAreWithinLimits) {
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
 generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, *gfxCoreHelper.get());
+std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize);
 validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
 }

 HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenAllWorkItemsCovered) {
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
 generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, *gfxCoreHelper.get());
+std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize);
 validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
 }

 HWTEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
 auto dimensionsOrder = std::array<uint8_t, 3>{{0, 1, 2}};
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
 generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-dimensionsOrder, false, grfSize, *gfxCoreHelper.get());
+dimensionsOrder, false, grfSize);
 validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
 validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }

 HWTEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
 auto dimensionsOrder = std::array<uint8_t, 3>{{1, 0, 2}};
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
 generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-dimensionsOrder, false, grfSize, *gfxCoreHelper.get());
+dimensionsOrder, false, grfSize);
 validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
 validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }

 HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
 auto dimensionsOrder = std::array<uint8_t, 3>{{2, 1, 0}};
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
 generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-dimensionsOrder, false, grfSize, *gfxCoreHelper.get());
+dimensionsOrder, false, grfSize);
 validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
 validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -344,8 +335,8 @@ struct LocalIdsLayoutForImagesTest : ::testing::TestWithParam<std::tuple<uint16_
 rowWidth = simd == 32u ? 32u : 16u;
 xDelta = simd == 8u ? 2u : 4u;
 }
+
 void generateLocalIds() {
-
 auto numGrfs = (localWorkSize.at(0) * localWorkSize.at(1) + (simd - 1)) / simd;
 elemsInBuffer = 3u * simd * numGrfs;
 if (simd == 8u) {
@@ -356,8 +347,7 @@ struct LocalIdsLayoutForImagesTest : ::testing::TestWithParam<std::tuple<uint16_
 memset(memory.get(), 0xff, size);
 buffer = reinterpret_cast<uint16_t *>(memory.get());
 EXPECT_TRUE(isCompatibleWithLayoutForImages(localWorkSize, dimensionsOrder, simd));
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
-generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, *gfxCoreHelper.get());
+generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize);
 }
 void validateGRF() {
 uint32_t totalLocalIds = localWorkSize.at(0) * localWorkSize.at(1);
@@ -457,9 +447,9 @@ TEST_P(LocalIdsLayoutTest, givenLocalWorkgroupSize4x4x1WhenGenerateLocalIdsThenH
 auto alignedMemory2 = allocateAlignedMemory(size, 32);
 auto buffer2 = reinterpret_cast<uint16_t *>(alignedMemory2.get());
 memset(buffer2, 0xff, size);
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
-generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, *gfxCoreHelper.get());
-generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, *gfxCoreHelper.get());
+
+generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize);
+generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize);

 for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) {
 for (auto j = 0u; j < rowWidth; j++) {
@@ -6,13 +6,11 @@
 */

 #include "shared/source/helpers/aligned_memory.h"
-#include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/hw_walk_order.h"
 #include "shared/source/helpers/per_thread_data.h"
 #include "shared/source/helpers/ptr_math.h"
 #include "shared/source/kernel/implicit_args.h"
 #include "shared/source/kernel/kernel_descriptor.h"
-#include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/test_macros/hw_test.h"

 using namespace NEO;
@@ -119,8 +117,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
 uint8_t pattern = 0xcd;

 memset(memoryToPatch.get(), pattern, totalSizeForPatching);
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
-auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get());
+
+auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {});

 EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));


@@ -160,8 +158,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
 uint8_t pattern = 0xcd;

 memset(memoryToPatch.get(), pattern, totalSizeForPatching);
-auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
-auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get());
+
+auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {});

 EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));

@@ -7,11 +7,8 @@

 #include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/helpers/aligned_memory.h"
-#include "shared/source/helpers/gfx_core_helper.h"
-#include "shared/source/helpers/hw_info.h"
 #include "shared/source/helpers/per_thread_data.h"
 #include "shared/source/kernel/local_ids_cache.h"
-#include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/mocks/mock_graphics_allocation.h"
 #include "shared/test/common/test_macros/test.h"

@@ -38,8 +35,7 @@ using LocalIdsCacheTest = Test<LocalIdsCacheFixture>;
 TEST_F(LocalIdsCacheTest, GivenCacheMissWhenGetLocalIdsForGroupThenNewEntryIsCommitedIntoLeastUsedEntry) {
 localIdsCache->cache.resize(2);
 localIdsCache->cache[0].accessCounter = 2U;
-auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily);
-localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get());
+localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data());

 EXPECT_EQ(groupSize, localIdsCache->cache[1].groupSize);
 EXPECT_NE(nullptr, localIdsCache->cache[1].localIdsData);

@@ -54,8 +50,7 @@ TEST_F(LocalIdsCacheTest, GivenEntryInCacheWhenGetLocalIdsForGroupThenEntryFromC
 localIdsCache->cache[0].localIdsSize = 512U;
 localIdsCache->cache[0].localIdsSizeAllocated = 512U;
 localIdsCache->cache[0].accessCounter = 1U;
-auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily);
-localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get());
+localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data());
 EXPECT_EQ(2U, localIdsCache->cache[0].accessCounter);
 }

@@ -68,8 +63,7 @@ TEST_F(LocalIdsCacheTest, GivenEntryWithBiggerBufferAllocatedWhenGetLocalIdsForG
 const auto localIdsData = localIdsCache->cache[0].localIdsData;

 groupSize = {2, 1, 1};
-auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily);
-localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data(), *gfxCoreHelper.get());
+localIdsCache->setLocalIdsForGroup(groupSize, perThreadData.data());
 EXPECT_EQ(1U, localIdsCache->cache[0].accessCounter);
 EXPECT_EQ(192U, localIdsCache->cache[0].localIdsSize);
 EXPECT_EQ(512U, localIdsCache->cache[0].localIdsSizeAllocated);