mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-25 21:42:53 +08:00
Adjust minWorkGroupSize when EUFusion is enabled
Related-To: NEO-5260
Co-authored-by: Bartlomiej Wolny <bartlomiej.wolny@intel.com>
Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
3c2a7ee49a
commit
b98148fb3f
@@ -366,13 +366,13 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
|
||||
|
||||
if (NEO::DebugManager.flags.EnableComputeWorkSizeND.get()) {
|
||||
auto usesImages = getImmutableData()->getDescriptor().kernelAttributes.flags.usesImages;
|
||||
auto coreFamily = module->getDevice()->getNEODevice()->getHardwareInfo().platform.eRenderCoreFamily;
|
||||
const auto hwInfo = &module->getDevice()->getNEODevice()->getHardwareInfo();
|
||||
const auto &deviceInfo = module->getDevice()->getNEODevice()->getDeviceInfo();
|
||||
uint32_t numThreadsPerSubSlice = (uint32_t)deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU;
|
||||
uint32_t localMemSize = (uint32_t)deviceInfo.localMemSize;
|
||||
|
||||
NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelImmData->getDescriptor().kernelAttributes.usesBarriers(), simd, this->getSlmTotalSize(),
|
||||
coreFamily, numThreadsPerSubSlice, localMemSize,
|
||||
hwInfo, numThreadsPerSubSlice, localMemSize,
|
||||
usesImages, false);
|
||||
NEO::computeWorkgroupSizeND(wsInfo, retGroupSize, workItems, dim);
|
||||
} else {
|
||||
|
||||
@@ -117,18 +117,21 @@ std::map<std::string, size_t> typeSizeMap = {
|
||||
{"double8", sizeof(cl_double8)},
|
||||
{"double16", sizeof(cl_double16)},
|
||||
};
|
||||
WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, GFXCORE_FAMILY coreFamily, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface) {
|
||||
|
||||
WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, const HardwareInfo *hwInfo, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface) {
|
||||
this->maxWorkGroupSize = maxWorkGroupSize;
|
||||
this->hasBarriers = hasBarriers;
|
||||
this->simdSize = simdSize;
|
||||
this->slmTotalSize = slmTotalSize;
|
||||
this->coreFamily = coreFamily;
|
||||
this->coreFamily = hwInfo->platform.eRenderCoreFamily;
|
||||
this->numThreadsPerSubSlice = numThreadsPerSubSlice;
|
||||
this->localMemSize = localMemSize;
|
||||
this->imgUsed = imgUsed;
|
||||
this->yTiledSurfaces = yTiledSurface;
|
||||
setMinWorkGroupSize();
|
||||
|
||||
setMinWorkGroupSize(hwInfo);
|
||||
}
|
||||
|
||||
WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
|
||||
auto &device = dispatchInfo.getClDevice();
|
||||
const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo();
|
||||
@@ -140,8 +143,9 @@ WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
|
||||
this->numThreadsPerSubSlice = static_cast<uint32_t>(device.getSharedDeviceInfo().maxNumEUsPerSubSlice) *
|
||||
device.getSharedDeviceInfo().numThreadsPerEU;
|
||||
this->localMemSize = static_cast<uint32_t>(device.getSharedDeviceInfo().localMemSize);
|
||||
|
||||
setIfUseImg(kernelInfo);
|
||||
setMinWorkGroupSize();
|
||||
setMinWorkGroupSize(&device.getHardwareInfo());
|
||||
}
|
||||
void WorkSizeInfo::setIfUseImg(const KernelInfo &kernelInfo) {
|
||||
for (const auto &arg : kernelInfo.kernelDescriptor.payloadMappings.explicitArgs) {
|
||||
@@ -152,7 +156,7 @@ void WorkSizeInfo::setIfUseImg(const KernelInfo &kernelInfo) {
|
||||
}
|
||||
}
|
||||
}
|
||||
void WorkSizeInfo::setMinWorkGroupSize() {
|
||||
void WorkSizeInfo::setMinWorkGroupSize(const HardwareInfo *hwInfo) {
|
||||
minWorkGroupSize = 0;
|
||||
if (hasBarriers) {
|
||||
uint32_t maxBarriersPerHSlice = (coreFamily >= IGFX_GEN9_CORE) ? 32 : 16;
|
||||
@@ -161,6 +165,11 @@ void WorkSizeInfo::setMinWorkGroupSize() {
|
||||
if (slmTotalSize > 0) {
|
||||
minWorkGroupSize = std::max(maxWorkGroupSize / ((localMemSize / slmTotalSize)), minWorkGroupSize);
|
||||
}
|
||||
|
||||
const auto &hwHelper = HwHelper::get(hwInfo->platform.eRenderCoreFamily);
|
||||
if (hwHelper.isFusedEuDispatchEnabled(*hwInfo)) {
|
||||
minWorkGroupSize *= 2;
|
||||
}
|
||||
}
|
||||
void WorkSizeInfo::checkRatio(const size_t workItems[3]) {
|
||||
if (slmTotalSize > 0) {
|
||||
|
||||
@@ -55,10 +55,10 @@ struct WorkSizeInfo {
|
||||
bool useStrictRatio = false;
|
||||
float targetRatio = 0;
|
||||
|
||||
WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, GFXCORE_FAMILY coreFamily, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface);
|
||||
WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, const HardwareInfo *hwInfo, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface);
|
||||
WorkSizeInfo(const DispatchInfo &dispatchInfo);
|
||||
void setIfUseImg(const KernelInfo &kernelInfo);
|
||||
void setMinWorkGroupSize();
|
||||
void setMinWorkGroupSize(const HardwareInfo *hwInfo);
|
||||
void checkRatio(const size_t workItems[3]);
|
||||
};
|
||||
|
||||
|
||||
@@ -11,14 +11,13 @@
|
||||
#include "opencl/source/command_queue/gpgpu_walker.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_cl_device.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_kernel.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "test.h"
|
||||
|
||||
using namespace NEO;
|
||||
|
||||
TEST(localWorkSizeTest, given3DimWorkGroupAndSimdEqual8AndBarriersWhenComputeCalledThenLocalGroupComputedCorrectly) {
|
||||
//wsInfo maxWorkGroupSize, hasBariers, simdSize, slmTotalSize, coreFamily, numThreadsPerSubSlice, localMemorySize, imgUsed, yTiledSurface
|
||||
WorkSizeInfo wsInfo(256, 1u, 8, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
//wsInfo maxWorkGroupSize, hasBariers, simdSize, slmTotalSize, hardwareInfo, numThreadsPerSubSlice, localMemorySize, imgUsed, yTiledSurface
|
||||
WorkSizeInfo wsInfo(256, 1u, 8, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 3;
|
||||
size_t workGroup[3] = {10000, 10000, 10000};
|
||||
size_t workGroupSize[3];
|
||||
@@ -41,8 +40,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8AndNoBarriersWhenComputeC
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
|
||||
|
||||
//wsInfo maxWorkGroupSize, hasBariers, simdSize, slmTotalSize, coreFamily, numThreadsPerSubSlice, localMemorySize, imgUsed, yTiledSurface
|
||||
WorkSizeInfo wsInfo(256, 0u, 8, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
//wsInfo maxWorkGroupSize, hasBariers, simdSize, slmTotalSize, hardwareInfo, numThreadsPerSubSlice, localMemorySize, imgUsed, yTiledSurface
|
||||
WorkSizeInfo wsInfo(256, 0u, 8, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {10003, 10003, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -61,8 +60,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8AndNoBarriersWhenComputeC
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, given1DimWorkGroupAndSimdEqual8WhenComputeCalledThenLocalGroupComputed) {
|
||||
//wsInfo maxWorkGroupSize, hasBariers, simdSize, slmTotalSize, coreFamily, numThreadsPerSubSlice, localMemorySize, imgUsed, yTiledSurface
|
||||
WorkSizeInfo wsInfo(256, 0u, 8, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
//wsInfo maxWorkGroupSize, hasBariers, simdSize, slmTotalSize, hardwareInfo, numThreadsPerSubSlice, localMemorySize, imgUsed, yTiledSurface
|
||||
WorkSizeInfo wsInfo(256, 0u, 8, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 1;
|
||||
size_t workGroup[3] = {6144, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -92,7 +91,7 @@ TEST(localWorkSizeTest, given1DimWorkGroupAndSimdEqual8WhenComputeCalledThenLoca
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, given1DimWorkGroupAndSimdEqual32WhenComputeCalledThenLocalGroupComputed) {
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 1;
|
||||
size_t workGroup[3] = {6144, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -116,7 +115,7 @@ TEST(localWorkSizeTest, given1DimWorkGroupAndSimdEqual32WhenComputeCalledThenLoc
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8WhenComputeCalledThenLocalGroupComputed) {
|
||||
WorkSizeInfo wsInfo(256, 0u, 8, 0u, defaultHwInfo->platform.eRenderCoreFamily, 56u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 8, 0u, defaultHwInfo.get(), 56u, 0u, false, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {384, 96, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -144,7 +143,7 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8WhenComputeCalledThenLoca
|
||||
TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual32WhenComputeCalledThenLocalGroupComputed) {
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {384, 96, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -176,7 +175,7 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual32WhenComputeCalledThenLoc
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, given3DimWorkGroupAndSimdEqual8WhenComputeCalledThenLocalGroupComputed) {
|
||||
WorkSizeInfo wsInfo(256, 0u, 8, 0u, defaultHwInfo->platform.eRenderCoreFamily, 56u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 8, 0u, defaultHwInfo.get(), 56u, 0u, false, false);
|
||||
uint32_t workDim = 3;
|
||||
size_t workGroup[3] = {384, 384, 384};
|
||||
size_t workGroupSize[3];
|
||||
@@ -212,7 +211,7 @@ TEST(localWorkSizeTest, given3DimWorkGroupAndSimdEqual8WhenComputeCalledThenLoca
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, given3DimWorkGroupAndSimdEqual32WhenComputeCalledThenLocalGroupComputed) {
|
||||
NEO::WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
NEO::WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 3;
|
||||
size_t workGroup[3] = {384, 384, 384};
|
||||
size_t workGroupSize[3];
|
||||
@@ -258,7 +257,7 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSquaredAlgorithmWhenComputeCalledTh
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
|
||||
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {384, 96, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -273,7 +272,7 @@ TEST(localWorkSizeTest, given1DimWorkGroupAndSquaredAlgorithmOnWhenComputeCalled
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
|
||||
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 1;
|
||||
size_t workGroup[3] = {1024, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -288,7 +287,7 @@ TEST(localWorkSizeTest, given2DdispatchWithImagesAndSquaredAlgorithmOnWhenLwsIsC
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
|
||||
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, true, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, true, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {256, 96, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -300,7 +299,7 @@ TEST(localWorkSizeTest, given2DdispatchWithImagesAndSquaredAlgorithmOnWhenLwsIsC
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenKernelWithTileYImagesAndBarrierWhenWorkgroupSizeIsComputedThenItMimicsTilingPattern) {
|
||||
WorkSizeInfo wsInfo(256, true, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, true, true);
|
||||
WorkSizeInfo wsInfo(256, true, 32, 0u, defaultHwInfo.get(), 32u, 0u, true, true);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -321,7 +320,7 @@ TEST(localWorkSizeTest, givenKernelWithTileYImagesAndBarrierWhenWorkgroupSizeIsC
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenKernelWithTileYImagesAndNoBarriersWhenWorkgroupSizeIsComputedThenItMimicsTilingPattern) {
|
||||
WorkSizeInfo wsInfo(256, false, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, true, true);
|
||||
WorkSizeInfo wsInfo(256, false, 32, 0u, defaultHwInfo.get(), 32u, 0u, true, true);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -342,7 +341,7 @@ TEST(localWorkSizeTest, givenKernelWithTileYImagesAndNoBarriersWhenWorkgroupSize
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenSimd16KernelWithTileYImagesAndNoBarriersWhenWorkgroupSizeIsComputedThenItMimicsTilingPattern) {
|
||||
WorkSizeInfo wsInfo(256, false, 16, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, true, true);
|
||||
WorkSizeInfo wsInfo(256, false, 16, 0u, defaultHwInfo.get(), 32u, 0u, true, true);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -363,7 +362,7 @@ TEST(localWorkSizeTest, givenSimd16KernelWithTileYImagesAndNoBarriersWhenWorkgro
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenKernelWithTwoDimensionalGlobalSizesWhenLwsIsComputedThenItHasMaxWorkgroupSize) {
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -377,7 +376,7 @@ TEST(localWorkSizeTest, givenKernelWithTwoDimensionalGlobalSizesWhenLwsIsCompute
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenKernelWithBarriersAndTiledImagesWithYdimensionHigherThenXDimensionWhenLwsIsComputedThenItMimicsTiling) {
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, true, true);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, true, true);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -412,7 +411,7 @@ TEST(localWorkSizeTest, givenKernelWithBarriersAndTiledImagesWithYdimensionHighe
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenHighOneDimensionalGwsWhenLwsIsComputedThenMaxWorkgoupSizeIsUsed) {
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -433,7 +432,7 @@ TEST(localWorkSizeTest, givenHighOneDimensionalGwsWhenLwsIsComputedThenMaxWorkgo
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenVeriousGwsSizesWithImagesWhenLwsIsComputedThenProperSizesAreReturned) {
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, true, true);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, true, true);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -477,7 +476,7 @@ TEST(localWorkSizeTest, givenVeriousGwsSizesWithImagesWhenLwsIsComputedThenPrope
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenHigh1DGwsAndSimdSize16WhenLwsIsComputedThenMaxWorkgroupSizeIsChoosen) {
|
||||
WorkSizeInfo wsInfo(256u, 0u, 16, 0u, defaultHwInfo->platform.eRenderCoreFamily, 56u, 0, false, false);
|
||||
WorkSizeInfo wsInfo(256u, 0u, 16, 0u, defaultHwInfo.get(), 56u, 0, false, false);
|
||||
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -490,7 +489,7 @@ TEST(localWorkSizeTest, givenHigh1DGwsAndSimdSize16WhenLwsIsComputedThenMaxWorkg
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenHigh1DGwsAndSimdSize8WhenLwsIsComputedThenMaxWorkgroupSizeIsChoosen) {
|
||||
WorkSizeInfo wsInfo(256u, 0u, 8, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0, false, false);
|
||||
WorkSizeInfo wsInfo(256u, 0u, 8, 0u, defaultHwInfo.get(), 32u, 0, false, false);
|
||||
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -503,7 +502,7 @@ TEST(localWorkSizeTest, givenHigh1DGwsAndSimdSize8WhenLwsIsComputedThenMaxWorkgr
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenKernelUtilizingImagesAndSlmWhenLwsIsBeingComputedThenItMimicsGlobalWorkgroupSizes) {
|
||||
WorkSizeInfo wsInfo(256u, 1u, 32, 4096u, defaultHwInfo->platform.eRenderCoreFamily, 56u, 65536u, true, true);
|
||||
WorkSizeInfo wsInfo(256u, 1u, 32, 4096u, defaultHwInfo.get(), 56u, 65536u, true, true);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1, 1, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -524,7 +523,7 @@ TEST(localWorkSizeTest, givenKernelUtilizingImagesAndSlmWhenLwsIsBeingComputedTh
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, GivenUseStrictRatioWhenLwsIsBeingComputedThenWgsIsCalculatedCorrectly) {
|
||||
WorkSizeInfo wsInfo(256u, 0u, 32u, 0u, defaultHwInfo->platform.eRenderCoreFamily, 0u, 0u, true, true);
|
||||
WorkSizeInfo wsInfo(256u, 0u, 32u, 0u, defaultHwInfo.get(), 0u, 0u, true, true);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {194, 234, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -552,7 +551,7 @@ TEST(localWorkSizeTest, GivenUseStrictRatioWhenLwsIsBeingComputedThenWgsIsCalcul
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, GivenUseBarriersWhenLwsIsBeingComputedThenWgsIsCalculatedCorrectly) {
|
||||
WorkSizeInfo wsInfo(256u, 1u, 32u, 0u, defaultHwInfo->platform.eRenderCoreFamily, 56u, 0u, true, true);
|
||||
WorkSizeInfo wsInfo(256u, 1u, 32u, 0u, defaultHwInfo.get(), 56u, 0u, true, true);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {194, 234, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -578,7 +577,7 @@ TEST(localWorkSizeTest, GivenUseBarriersWhenLwsIsBeingComputedThenWgsIsCalculate
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, given2DimWorkWhenComputeSquaredCalledThenLocalGroupComputed) {
|
||||
WorkSizeInfo wsInfo(256, 0u, 16, 0u, defaultHwInfo->platform.eRenderCoreFamily, 6u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 16, 0u, defaultHwInfo.get(), 6u, 0u, false, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {2048, 272, 1};
|
||||
size_t workGroupSize[3];
|
||||
@@ -627,7 +626,7 @@ TEST(localWorkSizeTest, given2DimWorkWhenComputeSquaredCalledThenLocalGroupCompu
|
||||
TEST(localWorkSizeTest, givenDeviceSupportingLws1024AndKernelCompiledInSimd8WhenGwsIs1024ThenLwsIsComputedAsMaxOptimalMultipliedBySimd) {
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
|
||||
WorkSizeInfo wsInfo(1024, 0u, 8, 0u, defaultHwInfo->platform.eRenderCoreFamily, 56u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(1024, 0u, 8, 0u, defaultHwInfo.get(), 56u, 0u, false, false);
|
||||
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {32, 32, 1};
|
||||
@@ -642,7 +641,7 @@ TEST(localWorkSizeTest, givenDeviceSupportingLws1024AndKernelCompiledInSimd8When
|
||||
TEST(localWorkSizeTest, givenDeviceWith36ThreadsPerSubsliceWhenSimd16KernelIsBeingSubmittedThenWorkgroupContainsOf8HwThreads) {
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 16, 0u, defaultHwInfo->platform.eRenderCoreFamily, 36u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 16, 0u, defaultHwInfo.get(), 36u, 0u, false, false);
|
||||
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1024, 1024, 1};
|
||||
@@ -657,7 +656,7 @@ TEST(localWorkSizeTest, givenDeviceWith36ThreadsPerSubsliceWhenSimd16KernelIsBei
|
||||
TEST(localWorkSizeTest, givenDeviceWith56ThreadsPerSubsliceWhenSimd16KernelIsBeingSubmittedThenWorkgroupContainsOf16HwThreads) {
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 16, 0u, defaultHwInfo->platform.eRenderCoreFamily, 56u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 16, 0u, defaultHwInfo.get(), 56u, 0u, false, false);
|
||||
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {1024, 1024, 1};
|
||||
@@ -687,6 +686,84 @@ TEST(localWorkSizeTest, givenDispatchInfoWhenWorkSizeInfoIsCreatedThenItHasCorre
|
||||
EXPECT_EQ(workSizeInfo.numThreadsPerSubSlice, threadsPerEu * euPerSubSlice);
|
||||
}
|
||||
|
||||
// Verifies the minimum work-group size computed by WorkSizeInfo when it is built from a
// DispatchInfo for a kernel that declares one barrier. On core families older than Gen12
// the plain barrier-derived value is expected; from Gen12 onwards twice that value.
TEST(localWorkSizeTest, givenDispatchInfoWhenWorkSizeInfoIsCreatedThenTestEuFusionFtr) {
    MockClDevice device{new MockDevice};
    MockKernelWithInternals kernel(device);
    // The kernel must report a barrier; presumably this is what enables the
    // minimum work-group size computation under test (confirm against WorkSizeInfo).
    kernel.kernelInfo.kernelDescriptor.kernelAttributes.barrierCount = 1;

    DispatchInfo dispatchInfo;
    dispatchInfo.setClDevice(&device);
    dispatchInfo.setKernel(kernel.mockKernel);

    // Reference values, derived from the device and kernel parameters.
    const auto coreFamily = defaultHwInfo->platform.eRenderCoreFamily;
    const uint32_t maxBarriersPerHSlice = (coreFamily >= IGFX_GEN9_CORE) ? 32 : 16;

    const auto &sharedDeviceInfo = device.getSharedDeviceInfo();
    const auto threadsPerSubSlice = static_cast<uint32_t>(sharedDeviceInfo.maxNumEUsPerSubSlice) *
                                    sharedDeviceInfo.numThreadsPerEU;
    const auto simdSize = static_cast<uint32_t>(kernel.mockKernel->getKernelInfo().getMaxSimdSize());

    const uint32_t nonFusedMinWorkGroupSize = threadsPerSubSlice * simdSize / maxBarriersPerHSlice;
    const uint32_t fusedMinWorkGroupSize = 2 * nonFusedMinWorkGroupSize;

    WorkSizeInfo workSizeInfo(dispatchInfo);

    if (coreFamily >= IGFX_GEN12_CORE) {
        // Gen12 and newer: the doubled (fused) minimum is expected.
        EXPECT_EQ(fusedMinWorkGroupSize, workSizeInfo.minWorkGroupSize);
    } else {
        EXPECT_EQ(nonFusedMinWorkGroupSize, workSizeInfo.minWorkGroupSize);
    }
}
|
||||
|
||||
using LocalWorkSizeTest = ::testing::Test;
|
||||
|
||||
// Gen12LP-and-newer only: drives the CFEFusedEUDispatch debug flag and checks that a
// WorkSizeInfo built from a DispatchInfo reports the plain minimum work-group size when
// the flag is set to true, and twice that size when it is set to false.
HWTEST2_F(LocalWorkSizeTest, givenDispatchInfoWhenWorkSizeInfoIsCreatedThenTestEuFusionFtrForcedByDebugManager, IsAtLeastGen12lp) {
    DebugManagerStateRestore dbgRestore;

    MockClDevice device{new MockDevice};
    MockKernelWithInternals kernel(device);
    kernel.kernelInfo.kernelDescriptor.kernelAttributes.barrierCount = 1;

    DispatchInfo dispatchInfo;
    dispatchInfo.setClDevice(&device);
    dispatchInfo.setKernel(kernel.mockKernel);

    // Reference value: threads per sub-slice times SIMD width, divided by 32
    // (the divisor the sibling test uses for core families >= Gen9).
    const auto &sharedDeviceInfo = device.getSharedDeviceInfo();
    const auto threadsPerSubSlice = static_cast<uint32_t>(sharedDeviceInfo.maxNumEUsPerSubSlice) *
                                    sharedDeviceInfo.numThreadsPerEU;
    const auto simdSize = static_cast<uint32_t>(kernel.mockKernel->getKernelInfo().getMaxSimdSize());

    const uint32_t nonFusedMinWorkGroupSize = threadsPerSubSlice * simdSize / 32;
    const uint32_t fusedMinWorkGroupSize = 2 * nonFusedMinWorkGroupSize;
    // Guard against a degenerate reference value that would make both checks trivial.
    EXPECT_NE(0u, nonFusedMinWorkGroupSize);

    {
        // Flag set to true (fused EU dispatch disabled): no doubling expected.
        DebugManager.flags.CFEFusedEUDispatch.set(true);
        WorkSizeInfo workSizeInfo(dispatchInfo);
        EXPECT_EQ(nonFusedMinWorkGroupSize, workSizeInfo.minWorkGroupSize);
    }

    {
        // Flag set to false (fused EU dispatch not disabled): doubled minimum expected.
        DebugManager.flags.CFEFusedEUDispatch.set(false);
        WorkSizeInfo workSizeInfo(dispatchInfo);
        EXPECT_EQ(fusedMinWorkGroupSize, workSizeInfo.minWorkGroupSize);
    }
}
|
||||
|
||||
// Gen12LP-and-newer only: same check as the DispatchInfo-based variant, but WorkSizeInfo is
// built through the explicit-parameter constructor that takes a HardwareInfo pointer.
HWTEST2_F(LocalWorkSizeTest, givenWorkSizeInfoIsCreatedWithHwInfoThenTestEuFusionFtrForcedByDebugManager, IsAtLeastGen12lp) {
    DebugManagerStateRestore dbgRestore;

    // Inputs handed to the WorkSizeInfo constructor below.
    const uint32_t maxWorkGroupSize = 512;
    const uint32_t simdSize = 16;
    const uint32_t numThreadsPerSubSlice = 36u;
    // Divisor used for the reference value (the sibling test uses 32 for core families >= Gen9).
    const uint32_t maxBarriersPerHSlice = 32;

    const uint32_t nonFusedMinWorkGroupSize = numThreadsPerSubSlice * simdSize / maxBarriersPerHSlice;
    const uint32_t fusedMinWorkGroupSize = 2 * nonFusedMinWorkGroupSize;
    EXPECT_NE(0u, nonFusedMinWorkGroupSize);

    {
        // Flag set to true (fused EU dispatch disabled): no doubling expected.
        DebugManager.flags.CFEFusedEUDispatch.set(true);
        WorkSizeInfo workSizeInfo(maxWorkGroupSize, 1u, simdSize, 0u, defaultHwInfo.get(), numThreadsPerSubSlice, 0u, false, false);
        EXPECT_EQ(nonFusedMinWorkGroupSize, workSizeInfo.minWorkGroupSize);
    }

    {
        // Flag set to false (fused EU dispatch not disabled): doubled minimum expected.
        DebugManager.flags.CFEFusedEUDispatch.set(false);
        WorkSizeInfo workSizeInfo(maxWorkGroupSize, 1u, simdSize, 0u, defaultHwInfo.get(), numThreadsPerSubSlice, 0u, false, false);
        EXPECT_EQ(fusedMinWorkGroupSize, workSizeInfo.minWorkGroupSize);
    }
}
|
||||
|
||||
TEST(localWorkSizeTest, givenDispatchInfoWhenWorkSizeInfoIsCreatedThenHasBarriersIsCorrectlySet) {
|
||||
MockClDevice device{new MockDevice};
|
||||
MockKernelWithInternals kernel(device);
|
||||
@@ -702,7 +779,7 @@ TEST(localWorkSizeTest, givenDispatchInfoWhenWorkSizeInfoIsCreatedThenHasBarrier
|
||||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenMaxWorkgroupSizeEqualToSimdSizeWhenLwsIsCalculatedThenItIsDownsizedToMaxWorkgroupSize) {
|
||||
WorkSizeInfo wsInfo(32, 0u, 32, 0u, defaultHwInfo->platform.eRenderCoreFamily, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(32, 0u, 32, 0u, defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {32, 32, 1};
|
||||
size_t workGroupSize[3];
|
||||
|
||||
@@ -55,7 +55,7 @@ struct WorkGroupSizeBase {
|
||||
size_t workGroupSize[3];
|
||||
auto maxWorkGroupSize = 256u;
|
||||
if (DebugManager.flags.EnableComputeWorkSizeND.get()) {
|
||||
WorkSizeInfo wsInfo(maxWorkGroupSize, 0u, simdSize, 0u, IGFX_GEN9_CORE, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(maxWorkGroupSize, 0u, simdSize, 0u, ::defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dims);
|
||||
} else {
|
||||
if (dims == 1)
|
||||
|
||||
@@ -799,7 +799,7 @@ TEST_F(PerformanceHintEnqueueKernelPrintfTest, GivenKernelWithPrintfWhenEnqueueK
|
||||
size_t preferredWorkGroupSize[3];
|
||||
auto maxWorkGroupSize = static_cast<uint32_t>(pPlatform->getClDevice(0)->getSharedDeviceInfo().maxWorkGroupSize);
|
||||
if (DebugManager.flags.EnableComputeWorkSizeND.get()) {
|
||||
WorkSizeInfo wsInfo(maxWorkGroupSize, 0u, 32u, 0u, IGFX_GEN9_CORE, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(maxWorkGroupSize, 0u, 32u, 0u, ::defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
computeWorkgroupSizeND(wsInfo, preferredWorkGroupSize, globalWorkGroupSize, 2);
|
||||
} else
|
||||
computeWorkgroupSize2D(maxWorkGroupSize, preferredWorkGroupSize, globalWorkGroupSize, 32);
|
||||
@@ -818,7 +818,7 @@ TEST_F(PerformanceHintEnqueueTest, GivenKernelWithCoherentPtrWhenEnqueueKernelIs
|
||||
Kernel::SimpleKernelArgInfo kernelArgInfo;
|
||||
|
||||
if (DebugManager.flags.EnableComputeWorkSizeND.get()) {
|
||||
WorkSizeInfo wsInfo(maxWorkGroupSize, 0u, 32u, 0u, IGFX_GEN9_CORE, 32u, 0u, false, false);
|
||||
WorkSizeInfo wsInfo(maxWorkGroupSize, 0u, 32u, 0u, ::defaultHwInfo.get(), 32u, 0u, false, false);
|
||||
computeWorkgroupSizeND(wsInfo, preferredWorkGroupSize, globalWorkGroupSize, 2);
|
||||
} else
|
||||
computeWorkgroupSize2D(maxWorkGroupSize, preferredWorkGroupSize, globalWorkGroupSize, 32);
|
||||
|
||||
Reference in New Issue
Block a user