mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-29 17:13:29 +08:00
fix(l0): check for largeGRF when computing maxWorkGroupSize
Sizing context (PVC): When using LargeGRF (a.k.a. GRF256) there are only 4 HW threads per EU (instead of the default 8). Together with SIMD16 this means that there can be at most 64 work-items per EU. With 8 EUs per subslice this gives 512 work-items on a single subslice. For correct intra-WG synchronization all of a work-group's WIs must be executed on the same subslice (to access the same SLM, where the synchronization primitives are stored). Thus, with SIMD16 and LargeGRF the work-group size must not exceed 512 (PVC example). So far `maxWorkGroupSize` has been taken solely from a DeviceInfo structure, both in `ModuleTranslationUnit::processUnpackedBinary()` and in `ModuleImp::initialize()`. This approach does not take kernel parameters (LargeGRF) into account. As a result, a kernel using LargeGRF with SIMD16 can be submitted with the work-group size set to 1024, which leads to a hang. Fix the `.maxWorkGroupSize` computation so that it takes the kernel parameters into consideration. Add new tests (for discrete platforms >= XeHP) and adapt existing ones; also apply minor cosmetic fixes along the way. Similar check for OCL: https://github.com/intel/compute-runtime/blob/master/opencl/source/command_queue/enqueue_kernel.h#L130 Related-To: NEO-7684 Signed-off-by: Maciej Bielski <maciej.bielski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
a3aa7a1326
commit
2778043d67
@@ -282,7 +282,11 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
Vec3<size_t> groupSize{groupSizeX, groupSizeY, groupSizeZ};
|
||||
auto itemsInGroup = Math::computeTotalElementsCount(groupSize);
|
||||
|
||||
if (itemsInGroup > module->getMaxGroupSize()) {
|
||||
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
|
||||
if (auto maxGroupSize = module->getMaxGroupSize(kernelDescriptor); itemsInGroup > maxGroupSize) {
|
||||
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
|
||||
"Requested work-group size (%lu) exceeds maximum value (%u) for the kernel \"%s\" \n",
|
||||
itemsInGroup, maxGroupSize, kernelDescriptor.kernelMetadata.kernelName.c_str());
|
||||
DEBUG_BREAK_IF(true);
|
||||
return ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION;
|
||||
}
|
||||
@@ -290,7 +294,6 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
this->groupSize[0] = groupSizeX;
|
||||
this->groupSize[1] = groupSizeY;
|
||||
this->groupSize[2] = groupSizeZ;
|
||||
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
|
||||
for (uint32_t i = 0u; i < 3u; i++) {
|
||||
if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[i] != 0 &&
|
||||
kernelDescriptor.kernelAttributes.requiredWorkgroupSize[i] != this->groupSize[i]) {
|
||||
@@ -349,14 +352,15 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
|
||||
uint32_t globalSizeZ, uint32_t *groupSizeX,
|
||||
uint32_t *groupSizeY, uint32_t *groupSizeZ) {
|
||||
size_t retGroupSize[3] = {};
|
||||
auto maxWorkGroupSize = module->getMaxGroupSize();
|
||||
auto simd = kernelImmData->getDescriptor().kernelAttributes.simdSize;
|
||||
const auto &kernelDescriptor = this->getImmutableData()->getDescriptor();
|
||||
auto maxWorkGroupSize = module->getMaxGroupSize(kernelDescriptor);
|
||||
auto simd = kernelDescriptor.kernelAttributes.simdSize;
|
||||
size_t workItems[3] = {globalSizeX, globalSizeY, globalSizeZ};
|
||||
uint32_t dim = (globalSizeY > 1U) ? 2 : 1U;
|
||||
dim = (globalSizeZ > 1U) ? 3 : dim;
|
||||
|
||||
if (NEO::DebugManager.flags.EnableComputeWorkSizeND.get()) {
|
||||
auto usesImages = getImmutableData()->getDescriptor().kernelAttributes.flags.usesImages;
|
||||
auto usesImages = kernelDescriptor.kernelAttributes.flags.usesImages;
|
||||
auto neoDevice = module->getDevice()->getNEODevice();
|
||||
const auto &deviceInfo = neoDevice->getDeviceInfo();
|
||||
uint32_t numThreadsPerSubSlice = (uint32_t)deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU;
|
||||
@@ -367,9 +371,9 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
|
||||
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
|
||||
}
|
||||
|
||||
NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelImmData->getDescriptor().kernelAttributes.usesBarriers(), simd, this->getSlmTotalSize(),
|
||||
NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelDescriptor.kernelAttributes.usesBarriers(), simd, this->getSlmTotalSize(),
|
||||
neoDevice->getRootDeviceEnvironment(), numThreadsPerSubSlice, localMemSize,
|
||||
usesImages, false, kernelImmData->getDescriptor().kernelAttributes.flags.requiresDisabledEUFusion);
|
||||
usesImages, false, kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion);
|
||||
NEO::computeWorkgroupSizeND(wsInfo, retGroupSize, workItems, dim);
|
||||
} else {
|
||||
if (1U == dim) {
|
||||
|
||||
Reference in New Issue
Block a user