Use device from API function in clGetKernelWorkGroupInfo/SubGroupInfo

Store execution environment reference in Kernel class.

Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
2025-09-15 13:01:45 +08:00 · 2020-11-24 14:20:33 +00:00
parent ae3ad3e8bc
commit 52d96af5f0
5 changed files with 42 additions and 32 deletions
--- a/opencl/source/api/api.cpp
+++ b/opencl/source/api/api.cpp
@ -1918,15 +1918,19 @@ cl_int CL_API_CALL clGetKernelWorkGroupInfo(cl_kernel kernel,
                   "paramValue", NEO::FileLoggerInstance().infoPointerToString(paramValue, paramValueSize),
                   "paramValueSizeRet", paramValueSizeRet);

-    auto pKernel = castToObject<Kernel>(kernel);
-    retVal = pKernel
-                 ? pKernel->getWorkGroupInfo(
-                       device,
-                       paramName,
-                       paramValueSize,
-                       paramValue,
-                       paramValueSizeRet)
-                 : CL_INVALID_KERNEL;
+    Kernel *pKernel = nullptr;
+    ClDevice *pClDevice = nullptr;
+    retVal = validateObjects(WithCastToInternal(device, &pClDevice),
+                             WithCastToInternal(kernel, &pKernel));
+
+    if (CL_SUCCESS == retVal) {
+        retVal = pKernel->getWorkGroupInfo(
+            *pClDevice,
+            paramName,
+            paramValueSize,
+            paramValue,
+            paramValueSizeRet);
+    }
    TRACING_EXIT(clGetKernelWorkGroupInfo, &retVal);
    return retVal;
 }
@ -5067,7 +5071,8 @@ cl_int CL_API_CALL clGetKernelSubGroupInfoKHR(cl_kernel kernel,
                   "paramValueSizeRet", paramValueSizeRet);

    Kernel *pKernel = nullptr;
-    retVal = validateObjects(device,
+    ClDevice *pClDevice = nullptr;
+    retVal = validateObjects(WithCastToInternal(device, &pClDevice),
                             WithCastToInternal(kernel, &pKernel));

    if (CL_SUCCESS != retVal) {
@ -5078,7 +5083,7 @@ cl_int CL_API_CALL clGetKernelSubGroupInfoKHR(cl_kernel kernel,
    case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE:
    case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE:
    case CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL:
-        return pKernel->getSubGroupInfo(paramName,
+        return pKernel->getSubGroupInfo(*pClDevice, paramName,
                                        inputValueSize, inputValue,
                                        paramValueSize, paramValue,
                                        paramValueSizeRet);
@ -5167,7 +5172,8 @@ cl_int CL_API_CALL clGetKernelSubGroupInfo(cl_kernel kernel,
                   "paramValueSizeRet", paramValueSizeRet);

    Kernel *pKernel = nullptr;
-    retVal = validateObjects(device,
+    ClDevice *pClDevice = nullptr;
+    retVal = validateObjects(WithCastToInternal(device, &pClDevice),
                             WithCastToInternal(kernel, &pKernel));

    if (CL_SUCCESS != retVal) {
@ -5175,7 +5181,7 @@ cl_int CL_API_CALL clGetKernelSubGroupInfo(cl_kernel kernel,
        return retVal;
    }

-    retVal = pKernel->getSubGroupInfo(paramName,
+    retVal = pKernel->getSubGroupInfo(*pClDevice, paramName,
                                      inputValueSize, inputValue,
                                      paramValueSize, paramValue,
                                      paramValueSizeRet);
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@ -68,6 +68,7 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, bool schedu
    : slmTotalSize(kernelInfoArg.workloadInfo.slmStaticSize),
      isParentKernel((kernelInfoArg.patchInfo.executionEnvironment != nullptr) ? (kernelInfoArg.patchInfo.executionEnvironment->HasDeviceEnqueue != 0) : false),
      isSchedulerKernel(schedulerKernel),
+      executionEnvironment(programArg->getExecutionEnvironment()),
      program(programArg),
      deviceVector(programArg->getDevices()),
      kernelInfo(kernelInfoArg) {
@ -256,7 +257,7 @@ cl_int Kernel::initialize() {
                retVal = CL_OUT_OF_RESOURCES;
                break;
            }
-            kernelDeviceInfos[rootDeviceIndex].privateSurface = getDevice().getMemoryManager()->allocateGraphicsMemoryWithProperties(
+            kernelDeviceInfos[rootDeviceIndex].privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
                {rootDeviceIndex,
                 static_cast<size_t>(kernelDeviceInfos[rootDeviceIndex].privateSurfaceSize),
                 GraphicsAllocation::AllocationType::PRIVATE_SURFACE,
@ -551,7 +552,7 @@ cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t
    return retVal;
 }

-cl_int Kernel::getWorkGroupInfo(cl_device_id device, cl_kernel_work_group_info paramName,
+cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info paramName,
                                size_t paramValueSize, void *paramValue,
                                size_t *paramValueSizeRet) const {
    cl_int retVal = CL_INVALID_VALUE;
@ -566,7 +567,7 @@ cl_int Kernel::getWorkGroupInfo(cl_device_id device, cl_kernel_work_group_info p
    cl_ulong scratchSize;
    cl_ulong privateMemSize;
    size_t maxWorkgroupSize;
-    const auto &hwInfo = getDevice().getHardwareInfo();
+    const auto &hwInfo = device.getHardwareInfo();
    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

@ -630,7 +631,7 @@ cl_int Kernel::getWorkGroupInfo(cl_device_id device, cl_kernel_work_group_info p
    return retVal;
 }

-cl_int Kernel::getSubGroupInfo(cl_kernel_sub_group_info paramName,
+cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info paramName,
                               size_t inputValueSize, const void *inputValue,
                               size_t paramValueSize, void *paramValue,
                               size_t *paramValueSizeRet) const {
@ -660,7 +661,7 @@ cl_int Kernel::getSubGroupInfo(cl_kernel_sub_group_info paramName,
        }
        numDimensions = inputValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
-            numDimensions > static_cast<size_t>(getDevice().getDeviceInfo().maxWorkItemDimensions)) {
+            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }
@ -674,7 +675,7 @@ cl_int Kernel::getSubGroupInfo(cl_kernel_sub_group_info paramName,
        }
        numDimensions = paramValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
-            numDimensions > static_cast<size_t>(getDevice().getDeviceInfo().maxWorkItemDimensions)) {
+            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }
@ -749,7 +750,7 @@ void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize)
    auto &heapInfo = pKernelInfo->heapInfo;
    heapInfo.KernelHeapSize = static_cast<uint32_t>(newKernelHeapSize);
    pKernelInfo->isKernelHeapSubstituted = true;
-    auto memoryManager = getDevice().getMemoryManager();
+    auto memoryManager = executionEnvironment.memoryManager.get();

    auto currentAllocationSize = pKernelInfo->kernelAllocation->getUnderlyingBufferSize();
    bool status = false;
@ -1068,7 +1069,7 @@ inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceive
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
-                auto pageFaultManager = getDevice().getMemoryManager()->getPageFaultManager();
+                auto pageFaultManager = executionEnvironment.memoryManager->getPageFaultManager();
                if (pageFaultManager &&
                    this->isUnifiedMemorySyncRequired) {
                    pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(pSVMAlloc->getGpuAddress()));
@ -1703,7 +1704,7 @@ void Kernel::createReflectionSurface() {
        kernelReflectionSize += blockCount * alignUp(maxConstantBufferSize, sizeof(void *));
        kernelReflectionSize += parentImageCount * sizeof(IGIL_ImageParamters);
        kernelReflectionSize += parentSamplerCount * sizeof(IGIL_ParentSamplerParams);
-        kernelReflectionSurface = getDevice().getMemoryManager()->allocateGraphicsMemoryWithProperties({getDevice().getRootDeviceIndex(), kernelReflectionSize, GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER, getDevice().getDeviceBitfield()});
+        kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties({getDevice().getRootDeviceIndex(), kernelReflectionSize, GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER, getDevice().getDeviceBitfield()});

        for (uint32_t i = 0; i < blockCount; i++) {
            const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
@ -1777,7 +1778,7 @@ void Kernel::createReflectionSurface() {

    if (DebugManager.flags.ForceDispatchScheduler.get()) {
        if (this->isSchedulerKernel && kernelReflectionSurface == nullptr) {
-            kernelReflectionSurface = getDevice().getMemoryManager()->allocateGraphicsMemoryWithProperties({getDevice().getRootDeviceIndex(), MemoryConstants::pageSize, GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER, getDevice().getDeviceBitfield()});
+            kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties({getDevice().getRootDeviceIndex(), MemoryConstants::pageSize, GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER, getDevice().getDeviceBitfield()});
        }
    }
 }
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@ -17,6 +17,7 @@

 #include "opencl/extensions/public/cl_ext_private.h"
 #include "opencl/source/api/cl_types.h"
+#include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/device_queue/device_queue.h"
 #include "opencl/source/helpers/base_object.h"
 #include "opencl/source/helpers/properties_helper.h"
@ -143,10 +144,10 @@ class Kernel : public BaseObject<_cl_kernel> {
    cl_int getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName,
                      size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;

-    cl_int getWorkGroupInfo(cl_device_id device, cl_kernel_work_group_info paramName,
+    cl_int getWorkGroupInfo(ClDevice &clDevice, cl_kernel_work_group_info paramName,
                            size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;

-    cl_int getSubGroupInfo(cl_kernel_sub_group_info paramName,
+    cl_int getSubGroupInfo(ClDevice &device, cl_kernel_sub_group_info paramName,
                           size_t inputValueSize, const void *inputValue,
                           size_t paramValueSize, void *paramValue,
                           size_t *paramValueSizeRet) const;
@ -509,7 +510,7 @@ class Kernel : public BaseObject<_cl_kernel> {
    const ClDevice &getDevice() const {
        return *deviceVector[0];
    }
-
+    const ExecutionEnvironment &executionEnvironment;
    Program *program;
    const ClDeviceVector &deviceVector;
    const KernelInfo &kernelInfo;
--- a/opencl/source/program/program.h
+++ b/opencl/source/program/program.h
@ -275,6 +275,8 @@ class Program : public BaseObject<_cl_program> {
        return 0 != exposedKernels;
    }

+    const ExecutionEnvironment &getExecutionEnvironment() const { return executionEnvironment; }
+
  protected:
    MOCKABLE_VIRTUAL cl_int createProgramFromBinary(const void *pBinary, size_t binarySize, ClDevice &clDevice);

--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@ -281,7 +281,7 @@ TEST_P(KernelTest, GivenKernelWorkGroupSizeWhenGettingWorkGroupInfoThenWorkGroup
    pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);

    retVal = pKernel->getWorkGroupInfo(
-        pClDevice,
+        *pClDevice,
        paramName,
        paramValueSize,
        &paramValue,
@ -299,7 +299,7 @@ TEST_P(KernelTest, GivenKernelCompileWorkGroupSizeWhenGettingWorkGroupInfoThenCo
    size_t paramValueSizeRet = 0;

    retVal = pKernel->getWorkGroupInfo(
-        pClDevice,
+        *pClDevice,
        paramName,
        paramValueSize,
        &paramValue,
@ -313,7 +313,7 @@ TEST_P(KernelTest, GivenInvalidParamNameWhenGettingWorkGroupInfoThenInvalidValue
    size_t paramValueSizeRet = 0x1234u;

    retVal = pKernel->getWorkGroupInfo(
-        pClDevice,
+        *pClDevice,
        0,
        0,
        nullptr,
@ -2653,13 +2653,13 @@ TEST(KernelTest, givenKernelWhenDebugFlagToUseMaxSimdForCalculationsIsUsedThenMa
    kernel.executionEnvironment.LargestCompiledSIMDSize = CommonConstants::maximalSimdSize;

    size_t maxKernelWkgSize;
-    kernel.mockKernel->getWorkGroupInfo(device.get(), CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
+    kernel.mockKernel->getWorkGroupInfo(*device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
    EXPECT_EQ(1024u, maxKernelWkgSize);
    kernel.executionEnvironment.LargestCompiledSIMDSize = 16;
-    kernel.mockKernel->getWorkGroupInfo(device.get(), CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
+    kernel.mockKernel->getWorkGroupInfo(*device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
    EXPECT_EQ(512u, maxKernelWkgSize);
    kernel.executionEnvironment.LargestCompiledSIMDSize = 8;
-    kernel.mockKernel->getWorkGroupInfo(device.get(), CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
+    kernel.mockKernel->getWorkGroupInfo(*device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
    EXPECT_EQ(256u, maxKernelWkgSize);
 }