mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-28 00:03:14 +08:00
Sizing context (PVC): When using LargeGRF (a.k.a. GRF256) there are only 4 HW threads per EU (instead of the default 8). Together with SIMD16 this means that there can be at most 64 work-items per EU. With 8 EUs per subslice this gives 512 work-items on a single subslice. For correct intra-WG synchronization, all WIs of a work-group must be executed on the same subslice (to access the same SLM, where the synchronization primitives are stored). Thus, with SIMD16 and LargeGRF the work-group size must not exceed 512 (PVC example). So far `maxWorkGroupSize` has been taken solely from a DeviceInfo structure, both in `ModuleTranslationUnit::processUnpackedBinary()` and in `ModuleImp::initialize()`. This approach does not take kernel parameters (LargeGRF) into account. It makes it possible to submit a kernel using LargeGRF with SIMD16 with the work-group size set to 1024, which leads to a hang. Fix the `.maxWorkGroupSize` computation so that it takes the kernel parameters into consideration. Add new tests (for discrete platforms >= XeHP) and adapt existing ones; fix cosmetics along the way. Similar check for OCL: https://github.com/intel/compute-runtime/blob/master/opencl/source/command_queue/enqueue_kernel.h#L130 Related-To: NEO-7684 Signed-off-by: Maciej Bielski <maciej.bielski@intel.com>
73 lines
2.4 KiB
C++
73 lines
2.4 KiB
C++
/*
 * Copyright (C) 2020-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */
|
|
|
|
#pragma once
|
|
|
|
#include <level_zero/ze_api.h>
|
|
|
|
#include <memory>
|
|
#include <set>
|
|
#include <vector>
|
|
|
|
// Empty tag type behind the module handle. L0::Module derives from it, which
// is what allows a module handle and a Module pointer to be converted into
// each other with a plain static_cast.
// NOTE(review): ze_module_handle_t is presumably declared in ze_api.h as a
// pointer to this struct - confirm against the Level Zero headers.
struct _ze_module_handle_t {
};
|
|
|
|
namespace NEO {
// Forward declaration only: this header uses KernelDescriptor solely by
// const reference, so the full definition is not required here.
struct KernelDescriptor;
} // namespace NEO
|
|
|
|
namespace L0 {
|
|
struct Device;
|
|
struct ModuleBuildLog;
|
|
struct KernelImmutableData;
|
|
|
|
// Kind of module being created; passed to Module::create().
// NOTE(review): presumably Builtin marks modules shipped with the driver and
// User marks modules supplied by the application - confirm against callers.
enum class ModuleType {
    Builtin,
    User
};
|
|
|
|
struct Module : _ze_module_handle_t {
|
|
|
|
static Module *create(Device *device, const ze_module_desc_t *desc, ModuleBuildLog *moduleBuildLog, ModuleType type, ze_result_t *result);
|
|
|
|
virtual ~Module() = default;
|
|
|
|
virtual Device *getDevice() const = 0;
|
|
|
|
virtual ze_result_t createKernel(const ze_kernel_desc_t *desc,
|
|
ze_kernel_handle_t *kernelHandle) = 0;
|
|
virtual ze_result_t destroy() = 0;
|
|
virtual ze_result_t getNativeBinary(size_t *pSize, uint8_t *pModuleNativeBinary) = 0;
|
|
virtual ze_result_t getFunctionPointer(const char *pKernelName, void **pfnFunction) = 0;
|
|
virtual ze_result_t getGlobalPointer(const char *pGlobalName, size_t *pSize, void **pPtr) = 0;
|
|
virtual ze_result_t getDebugInfo(size_t *pDebugDataSize, uint8_t *pDebugData) = 0;
|
|
virtual ze_result_t getKernelNames(uint32_t *pCount, const char **pNames) = 0;
|
|
virtual ze_result_t getProperties(ze_module_properties_t *pModuleProperties) = 0;
|
|
virtual ze_result_t performDynamicLink(uint32_t numModules,
|
|
ze_module_handle_t *phModules,
|
|
ze_module_build_log_handle_t *phLinkLog) = 0;
|
|
|
|
virtual const KernelImmutableData *getKernelImmutableData(const char *kernelName) const = 0;
|
|
virtual const std::vector<std::unique_ptr<KernelImmutableData>> &getKernelImmutableDataVector() const = 0;
|
|
virtual uint32_t getMaxGroupSize(const NEO::KernelDescriptor &kernelDescriptor) const = 0;
|
|
virtual bool isDebugEnabled() const = 0;
|
|
virtual bool shouldAllocatePrivateMemoryPerDispatch() const = 0;
|
|
virtual uint32_t getProfileFlags() const = 0;
|
|
virtual void checkIfPrivateMemoryPerDispatchIsNeeded() = 0;
|
|
|
|
Module() = default;
|
|
Module(const Module &) = delete;
|
|
Module(Module &&) = delete;
|
|
Module &operator=(const Module &) = delete;
|
|
Module &operator=(Module &&) = delete;
|
|
|
|
static Module *fromHandle(ze_module_handle_t handle) { return static_cast<Module *>(handle); }
|
|
|
|
inline ze_module_handle_t toHandle() { return this; }
|
|
};
|
|
|
|
} // namespace L0
|