compute-runtime/runtime/command_queue/enqueue_kernel.h

/*
 * Copyright (C) 2017-2019 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "runtime/built_ins/builtins_dispatch_builder.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "runtime/mem_obj/buffer.h"
#include "runtime/memory_manager/surface.h"

#include "hw_cmds.h"

#include <algorithm>
#include <limits>
#include <new>

namespace OCLRT {

/**
 * Validates the ND-range description of a kernel launch and forwards it to
 * enqueueHandler<CL_COMMAND_NDRANGE_KERNEL>.
 *
 * @param clKernel            kernel handle, converted with castToObject<Kernel>.
 * @param workDim             number of entries read from the size/offset arrays (0..3).
 * @param globalWorkOffsetIn  per-dimension global offsets; nullptr is treated as all zeros.
 * @param globalWorkSizeIn    per-dimension global sizes; nullptr is treated as all zeros.
 * @param localWorkSizeIn     per-dimension local sizes, or nullptr when the caller gives none.
 * @param numEventsInWaitList forwarded unchanged to enqueueHandler.
 * @param eventWaitList       forwarded unchanged to enqueueHandler.
 * @param event               forwarded to enqueueHandler; set to nullptr when the kernel
 *                            is not fully patched.
 *
 * @return CL_SUCCESS once the launch has been handed to enqueueHandler, otherwise:
 *         - CL_INVALID_WORK_DIMENSION  when workDim exceeds 3,
 *         - CL_INVALID_KERNEL          when clKernel does not resolve to a Kernel,
 *         - CL_INVALID_KERNEL_ARGS     when kernel.isPatched() is false,
 *         - CL_INVALID_WORK_GROUP_SIZE when a local size is zero, differs from the
 *           kernel's required work-group size, does not evenly divide the global size
 *           (and non-uniform groups are not allowed), or the product of the local
 *           sizes exceeds the device's maxWorkGroupSize,
 *         - the error reported by the builtin dispatch builder's validateDispatch().
 */
template <typename GfxFamily>
cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
    cl_kernel clKernel,
    cl_uint workDim,
    const size_t *globalWorkOffsetIn,
    const size_t *globalWorkSizeIn,
    const size_t *localWorkSizeIn,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {

    // All per-dimension arrays below have exactly three slots; a larger
    // workDim would write past them.
    constexpr cl_uint maxWorkDim = 3u;
    if (workDim > maxWorkDim) {
        return CL_INVALID_WORK_DIMENSION;
    }

    size_t region[3] = {1, 1, 1};
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t workGroupSize[3] = {1, 1, 1};
    // Local sizes exactly as requested by the caller, kept only for logging.
    // Entries stay 0 when no local size was given or the dimension is unused.
    size_t requestedLocalSize[3] = {0, 0, 0};

    auto *pKernel = castToObject<Kernel>(clKernel);
    if (pKernel == nullptr) {
        return CL_INVALID_KERNEL;
    }
    auto &kernel = *pKernel;
    const auto &kernelInfo = kernel.getKernelInfo();

    if (!kernel.isPatched()) {
        if (event) {
            *event = nullptr;
        }

        return CL_INVALID_KERNEL_ARGS;
    }

    if (kernel.isUsingSharedObjArgs()) {
        kernel.resetSharedObjectsPatchAddresses();
    }

    const bool haveRequiredWorkGroupSize =
        kernelInfo.reqdWorkGroupSize[0] != WorkloadInfo::undefinedOffset;

    constexpr size_t sizeMax = std::numeric_limits<size_t>::max();
    size_t remainder = 0;
    size_t totalWorkItems = 1u;
    const size_t *localWkgSizeToPass = localWorkSizeIn ? workGroupSize : nullptr;

    for (auto i = 0u; i < workDim; i++) {
        region[i] = globalWorkSizeIn ? globalWorkSizeIn[i] : 0;
        globalWorkOffset[i] = globalWorkOffsetIn
                                  ? globalWorkOffsetIn[i]
                                  : 0;

        if (localWorkSizeIn) {
            const size_t requested = localWorkSizeIn[i];

            if (haveRequiredWorkGroupSize) {
                if (kernelInfo.reqdWorkGroupSize[i] != requested) {
                    return CL_INVALID_WORK_GROUP_SIZE;
                }
            }
            if (requested == 0) {
                return CL_INVALID_WORK_GROUP_SIZE;
            }
            if (kernel.getAllowNonUniform()) {
                // Limit the local size to the global size, but never let it drop
                // to 0: a zero global size would otherwise make the modulo below
                // divide by zero. region[i] is used instead of globalWorkSizeIn[i]
                // so a null globalWorkSizeIn is not dereferenced.
                workGroupSize[i] = std::min(requested, std::max<size_t>(region[i], 1u));
            } else {
                workGroupSize[i] = requested;
            }
            requestedLocalSize[i] = requested;

            // Saturate instead of wrapping so an overflowing product cannot
            // slip under the maxWorkGroupSize check further down.
            if (totalWorkItems > sizeMax / requested) {
                totalWorkItems = sizeMax;
            } else {
                totalWorkItems *= requested;
            }
        }

        remainder += region[i] % workGroupSize[i];
    }

    if (remainder != 0 && !kernel.getAllowNonUniform()) {
        return CL_INVALID_WORK_GROUP_SIZE;
    }

    // A required work-group size declared by the kernel overrides whatever
    // the caller passed (it was already verified to match above).
    if (haveRequiredWorkGroupSize) {
        localWkgSizeToPass = kernelInfo.reqdWorkGroupSize;
    }

    NullSurface s;
    Surface *surfaces[] = {&s};

    if (context->isProvidingPerformanceHints()) {
        if (kernel.hasPrintfOutput()) {
            context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL, PRINTF_DETECTED_IN_KERNEL, kernelInfo.name.c_str());
        }
        if (kernel.requiresCoherency()) {
            context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL, KERNEL_REQUIRES_COHERENCY, kernelInfo.name.c_str());
        }
    }

    if (kernelInfo.builtinDispatchBuilder != nullptr) {
        cl_int err = kernelInfo.builtinDispatchBuilder->validateDispatch(&kernel, workDim, Vec3<size_t>(region), Vec3<size_t>(workGroupSize), Vec3<size_t>(globalWorkOffset));
        if (err != CL_SUCCESS)
            return err;
    }

    // Log from the local copies: the caller's arrays are only guaranteed to
    // hold workDim entries (and globalWorkSizeIn may be null), so indexing
    // them with 0..2 unconditionally could read out of bounds.
    DBG_LOG(PrintDispatchParameters, "Kernel: ", kernelInfo.name,
            ",LWS:, ", requestedLocalSize[0],
            ",", requestedLocalSize[1],
            ",", requestedLocalSize[2],
            ",GWS:,", region[0],
            ",", region[1],
            ",", region[2],
            ",SIMD:, ", kernelInfo.getMaxSimdSize());

    if (totalWorkItems > this->getDevice().getDeviceInfo().maxWorkGroupSize) {
        return CL_INVALID_WORK_GROUP_SIZE;
    }

    enqueueHandler<CL_COMMAND_NDRANGE_KERNEL>(
        surfaces,
        false,
        &kernel,
        workDim,
        globalWorkOffset,
        region,
        localWkgSizeToPass,
        numEventsInWaitList,
        eventWaitList,
        event);

    return CL_SUCCESS;
}
} // namespace OCLRT
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`/*`
Limiting the value of LWS to the value of GWS Change-Id: I24e89125e586ed77d396ba9e40dd039f1ab213fe 2019-01-30 21:09:58 +08:00			`* Copyright (C) 2017-2019 Intel Corporation`
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`*`
Update copyright headers Updating files modified in 2018 only. Older files remain with old style copyright header Change-Id: Ic99f2e190ad74b4b7f2bd79dd7b9fa5fbe36ec92 Signed-off-by: Artur Harasimiuk <artur.harasimiuk@intel.com> 2018-09-18 15:11:08 +08:00			`* SPDX-License-Identifier: MIT`
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`*`
			`*/`

			`#pragma once`
separate BuiltinDispatchInfoBuilder from built_ins.h We don't need BuiltinDispatchInfoBuilder in every place where built ins are used. specifically in .cpp files generated from kernel binary. Change-Id: Ie739951cdc93873993f78ad14cee656122af51fd Signed-off-by: Artur Harasimiuk <artur.harasimiuk@intel.com> 2018-04-18 20:59:28 +08:00			`#include "runtime/built_ins/builtins_dispatch_builder.h"`
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`#include "runtime/command_queue/command_queue_hw.h"`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`#include "runtime/command_queue/gpgpu_walker.h"`
clang-format: enable sorting includes Include files are now grouped and sorted in following order: 1. Header file of the class the current file implements 2. Project files 3. Third party files 4. Standard library Change-Id: If31af05652184169f7fee1d7ad08f1b2ed602cf0 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2019-02-27 18:39:32 +08:00			`#include "runtime/command_stream/command_stream_receiver.h"`
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`#include "runtime/helpers/kernel_commands.h"`
			`#include "runtime/helpers/task_information.h"`
			`#include "runtime/mem_obj/buffer.h"`
			`#include "runtime/memory_manager/surface.h"`
clang-format: enable sorting includes Include files are now grouped and sorted in following order: 1. Header file of the class the current file implements 2. Project files 3. Third party files 4. Standard library Change-Id: If31af05652184169f7fee1d7ad08f1b2ed602cf0 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2019-02-27 18:39:32 +08:00
			`#include "hw_cmds.h"`

Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`#include <new>`

			`namespace OCLRT {`

			`template <typename GfxFamily>`
			`cl_int CommandQueueHw<GfxFamily>::enqueueKernel(`
			`cl_kernel clKernel,`
			`cl_uint workDim,`
			`const size_t *globalWorkOffsetIn,`
			`const size_t *globalWorkSizeIn,`
			`const size_t *localWorkSizeIn,`
			`cl_uint numEventsInWaitList,`
			`const cl_event *eventWaitList,`
			`cl_event *event) {`

			`size_t region[3] = {1, 1, 1};`
			`size_t globalWorkOffset[3] = {0, 0, 0};`
			`size_t workGroupSize[3] = {1, 1, 1};`

			`auto &kernel = *castToObject<Kernel>(clKernel);`
			`const auto &kernelInfo = kernel.getKernelInfo();`

			`if (!kernel.isPatched()) {`
			`if (event) {`
			`*event = nullptr;`
			`}`

			`return CL_INVALID_KERNEL_ARGS;`
			`}`

			`if (kernel.isUsingSharedObjArgs()) {`
			`kernel.resetSharedObjectsPatchAddresses();`
			`}`

			`bool haveRequiredWorkGroupSize = false;`

			`if (kernelInfo.reqdWorkGroupSize[0] != WorkloadInfo::undefinedOffset) {`
			`haveRequiredWorkGroupSize = true;`
			`}`

			`size_t remainder = 0;`
Add check for local work group size in clEnqueueNDRangeKernel call. - Incoming local work group size cannot exceed device capabilities. Change-Id: I89a7503155c71443e3ebc630debb5d5b466c6cb5 2018-04-20 13:58:48 +08:00			`size_t totalWorkItems = 1u;`
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`const size_t *localWkgSizeToPass = localWorkSizeIn ? workGroupSize : nullptr;`

			`for (auto i = 0u; i < workDim; i++) {`
			`region[i] = globalWorkSizeIn ? globalWorkSizeIn[i] : 0;`
			`globalWorkOffset[i] = globalWorkOffsetIn`
			`? globalWorkOffsetIn[i]`
			`: 0;`

			`if (localWorkSizeIn) {`
			`if (haveRequiredWorkGroupSize) {`
			`if (kernelInfo.reqdWorkGroupSize[i] != localWorkSizeIn[i]) {`
			`return CL_INVALID_WORK_GROUP_SIZE;`
			`}`
			`}`
Fix division by zero in enqueueKernel Change-Id: I8e7d3db39805133a5af545e65a94fb19433a2a41 2018-08-13 21:58:57 +08:00			`if (localWorkSizeIn[i] == 0) {`
			`return CL_INVALID_WORK_GROUP_SIZE;`
			`}`
Limiting the value of LWS to the value of GWS Change-Id: I24e89125e586ed77d396ba9e40dd039f1ab213fe 2019-01-30 21:09:58 +08:00			`if (kernel.getAllowNonUniform()) {`
			`workGroupSize[i] = std::min(localWorkSizeIn[i], globalWorkSizeIn[i]);`
			`} else {`
			`workGroupSize[i] = localWorkSizeIn[i];`
			`}`
Add check for local work group size in clEnqueueNDRangeKernel call. - Incoming local work group size cannot exceed device capabilities. Change-Id: I89a7503155c71443e3ebc630debb5d5b466c6cb5 2018-04-20 13:58:48 +08:00			`totalWorkItems *= localWorkSizeIn[i];`
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`}`

			`remainder += region[i] % workGroupSize[i];`
			`}`

			`if (remainder != 0 && !kernel.getAllowNonUniform()) {`
			`return CL_INVALID_WORK_GROUP_SIZE;`
			`}`

			`if (haveRequiredWorkGroupSize) {`
			`localWkgSizeToPass = kernelInfo.reqdWorkGroupSize;`
			`}`

			`NullSurface s;`
			`Surface *surfaces[] = {&s};`

			`if (context->isProvidingPerformanceHints()) {`
			`if (kernel.hasPrintfOutput()) {`
			`context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL, PRINTF_DETECTED_IN_KERNEL, kernel.getKernelInfo().name.c_str());`
			`}`
			`if (kernel.requiresCoherency()) {`
			`context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL, KERNEL_REQUIRES_COHERENCY, kernel.getKernelInfo().name.c_str());`
			`}`
			`}`

			`if (kernel.getKernelInfo().builtinDispatchBuilder != nullptr) {`
			`cl_int err = kernel.getKernelInfo().builtinDispatchBuilder->validateDispatch(&kernel, workDim, Vec3<size_t>(region), Vec3<size_t>(workGroupSize), Vec3<size_t>(globalWorkOffset));`
			`if (err != CL_SUCCESS)`
			`return err;`
			`}`

Add debug flag to dump dispatch parameters. - Also refactor debug manager tests , they now check for default value in igdrcl.config file - There is no need to write dedicated tests now , so I remove them. Change-Id: Ib338ca05b6059302c29469c673239e7886dc4b9b 2018-03-16 17:12:38 +08:00			`DBG_LOG(PrintDispatchParameters, "Kernel: ", kernel.getKernelInfo().name,`
			`",LWS:, ", localWorkSizeIn ? localWorkSizeIn[0] : 0,`
			`",", localWorkSizeIn ? localWorkSizeIn[1] : 0,`
			`",", localWorkSizeIn ? localWorkSizeIn[2] : 0,`
			`",GWS:,", globalWorkSizeIn[0],`
			`",", globalWorkSizeIn[1],`
			`",", globalWorkSizeIn[2],`
			`",SIMD:, ", kernel.getKernelInfo().getMaxSimdSize());`

Add check for local work group size in clEnqueueNDRangeKernel call. - Incoming local work group size cannot exceed device capabilities. Change-Id: I89a7503155c71443e3ebc630debb5d5b466c6cb5 2018-04-20 13:58:48 +08:00			`if (totalWorkItems > this->getDevice().getDeviceInfo().maxWorkGroupSize) {`
			`return CL_INVALID_WORK_GROUP_SIZE;`
			`}`

Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`enqueueHandler<CL_COMMAND_NDRANGE_KERNEL>(`
			`surfaces,`
			`false,`
			`&kernel,`
			`workDim,`
			`globalWorkOffset,`
			`region,`
			`localWkgSizeToPass,`
			`numEventsInWaitList,`
			`eventWaitList,`
			`event);`

			`return CL_SUCCESS;`
			`}`
			`} // namespace OCLRT`